bank <- read_delim("bank-additional.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 4119 Columns: 21
## -- Column specification --------------------------------------------------------
## Delimiter: ";"
## chr (11): job, marital, education, default, housing, loan, contact, month, d...
## dbl (10): age, duration, campaign, pdays, previous, emp.var.rate, cons.price...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(bank)
## spec_tbl_df [4,119 x 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:4119] 30 39 25 38 47 32 32 41 31 35 ...
## $ job : chr [1:4119] "blue-collar" "services" "services" "services" ...
## $ marital : chr [1:4119] "married" "single" "married" "married" ...
## $ education : chr [1:4119] "basic.9y" "high.school" "high.school" "basic.9y" ...
## $ default : chr [1:4119] "no" "no" "no" "no" ...
## $ housing : chr [1:4119] "yes" "no" "yes" "unknown" ...
## $ loan : chr [1:4119] "no" "no" "no" "unknown" ...
## $ contact : chr [1:4119] "cellular" "telephone" "telephone" "telephone" ...
## $ month : chr [1:4119] "may" "may" "jun" "jun" ...
## $ day_of_week : chr [1:4119] "fri" "fri" "wed" "fri" ...
## $ duration : num [1:4119] 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : num [1:4119] 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : num [1:4119] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : num [1:4119] 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : chr [1:4119] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num [1:4119] -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num [1:4119] 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num [1:4119] -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num [1:4119] 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num [1:4119] 5099 5191 5228 5228 5196 ...
## $ y : chr [1:4119] "no" "no" "no" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. job = col_character(),
## .. marital = col_character(),
## .. education = col_character(),
## .. default = col_character(),
## .. housing = col_character(),
## .. loan = col_character(),
## .. contact = col_character(),
## .. month = col_character(),
## .. day_of_week = col_character(),
## .. duration = col_double(),
## .. campaign = col_double(),
## .. pdays = col_double(),
## .. previous = col_double(),
## .. poutcome = col_character(),
## .. emp.var.rate = col_double(),
## .. cons.price.idx = col_double(),
## .. cons.conf.idx = col_double(),
## .. euribor3m = col_double(),
## .. nr.employed = col_double(),
## .. y = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
bank$job = as.factor(bank$job)
bank$education = as.factor(bank$education)
bank$marital = as.factor(bank$marital)
bank$default = as.factor(bank$default)
bank$housing = as.factor(bank$housing)
bank$loan = as.factor(bank$loan)
bank$contact = as.factor(bank$contact)
bank$month = as.factor(bank$month)
bank$day_of_week = as.factor(bank$day_of_week)
bank$poutcome = as.factor(bank$poutcome)
bank$y = as.factor(bank$y)
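The eleven individual conversions above can also be done in a single pass over every character column; a minimal base-R sketch that is equivalent in effect:
char_cols <- sapply(bank, is.character)                 # identify character columns
bank[char_cols] <- lapply(bank[char_cols], as.factor)   # convert them all to factors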
summary(bank)
## age job marital education
## Min. :18.00 admin. :1012 divorced: 446 university.degree :1264
## 1st Qu.:32.00 blue-collar: 884 married :2509 high.school : 921
## Median :38.00 technician : 691 single :1153 basic.9y : 574
## Mean :40.11 services : 393 unknown : 11 professional.course: 535
## 3rd Qu.:47.00 management : 324 basic.4y : 429
## Max. :88.00 retired : 166 basic.6y : 228
## (Other) : 649 (Other) : 168
## default housing loan contact month
## no :3315 no :1839 no :3349 cellular :2652 may :1378
## unknown: 803 unknown: 105 unknown: 105 telephone:1467 jul : 711
## yes : 1 yes :2175 yes : 665 aug : 636
## jun : 530
## nov : 446
## apr : 215
## (Other): 203
## day_of_week duration campaign pdays previous
## fri:768 Min. : 0.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:855 1st Qu.: 103.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:860 Median : 181.0 Median : 2.000 Median :999.0 Median :0.0000
## tue:841 Mean : 256.8 Mean : 2.537 Mean :960.4 Mean :0.1903
## wed:795 3rd Qu.: 317.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :3643.0 Max. :35.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 454 Min. :-3.40000 Min. :92.20 Min. :-50.8
## nonexistent:3523 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
## success : 142 Median : 1.10000 Median :93.75 Median :-41.8
## Mean : 0.08497 Mean :93.58 Mean :-40.5
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
## Max. : 1.40000 Max. :94.77 Max. :-26.9
##
## euribor3m nr.employed y
## Min. :0.635 Min. :4964 no :3668
## 1st Qu.:1.334 1st Qu.:5099 yes: 451
## Median :4.857 Median :5191
## Mean :3.621 Mean :5166
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
sum(duplicated(bank))
## [1] 0
There are no duplicates in the dataset.
sum(is.na(bank$age))
## [1] 0
sum(is.na(bank$campaign))
## [1] 0
sum(is.na(bank$pdays))
## [1] 0
sum(is.na(bank$previous))
## [1] 0
sum(is.na(bank$emp.var.rate))
## [1] 0
sum(is.na(bank$cons.price.idx))
## [1] 0
sum(is.na(bank$cons.conf.idx))
## [1] 0
sum(is.na(bank$nr.employed))
## [1] 0
No missing values were found.
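The column-by-column checks above can be condensed into a single call; a small sketch:
colSums(is.na(bank))   # NA count for every column at once
any(is.na(bank))       # TRUE if any value in the data frame is missing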
ggplot(bank,aes(job))+geom_bar(aes(fill= y),position = position_dodge())+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Analysis: Admins form the largest group of subscribers to the term deposit, but this may simply be because admin. is the largest job category in the data.
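Because the count plot is dominated by group size, a proportion view makes the subscription rate per job easier to compare; a sketch using the same aesthetics (position = "fill" stacks each bar to 100%):
ggplot(bank, aes(job)) +
  geom_bar(aes(fill = y), position = "fill") +
  ylab("Proportion") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))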
ggplot(bank,aes(marital))+geom_bar(aes(fill= y),position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Analysis: Keep variable
ggplot(bank,aes(education))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Analysis: Keep variable
ggplot(bank,aes(x = y,y = age))+geom_boxplot(aes(fill= y))+xlab("Subscribed")
Analysis: Age does not appear to be a strong predictor of whether a customer subscribed to the term deposit, since the distributions are nearly the same for both groups. We recommend NOT TO KEEP IT in the model.
ggplot(bank,aes(day_of_week))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Analysis: The number of subscribers is fairly even across the days of the week, so this might not be a good predictor for our response variable.
ggplot(bank,aes(month))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Analysis: From the plot, we observe that May has the highest number of subscribers, so month could be a useful predictor.
ggplot(bank,aes(housing))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Analysis: From the plot, housing does not appear to have a meaningful effect on the number of subscribers.
CrossTable(bank$job, bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$job | no | yes | Row Total |
## --------------|-----------|-----------|-----------|
## admin. | 879 | 133 | 1012 |
## | 0.547 | 4.445 | |
## | 0.869 | 0.131 | 0.246 |
## | 0.240 | 0.295 | |
## | 0.213 | 0.032 | |
## --------------|-----------|-----------|-----------|
## blue-collar | 823 | 61 | 884 |
## | 1.627 | 13.235 | |
## | 0.931 | 0.069 | 0.215 |
## | 0.224 | 0.135 | |
## | 0.200 | 0.015 | |
## --------------|-----------|-----------|-----------|
## entrepreneur | 140 | 8 | 148 |
## | 0.511 | 4.154 | |
## | 0.946 | 0.054 | 0.036 |
## | 0.038 | 0.018 | |
## | 0.034 | 0.002 | |
## --------------|-----------|-----------|-----------|
## housemaid | 99 | 11 | 110 |
## | 0.011 | 0.091 | |
## | 0.900 | 0.100 | 0.027 |
## | 0.027 | 0.024 | |
## | 0.024 | 0.003 | |
## --------------|-----------|-----------|-----------|
## management | 294 | 30 | 324 |
## | 0.104 | 0.845 | |
## | 0.907 | 0.093 | 0.079 |
## | 0.080 | 0.067 | |
## | 0.071 | 0.007 | |
## --------------|-----------|-----------|-----------|
## retired | 128 | 38 | 166 |
## | 2.659 | 21.622 | |
## | 0.771 | 0.229 | 0.040 |
## | 0.035 | 0.084 | |
## | 0.031 | 0.009 | |
## --------------|-----------|-----------|-----------|
## self-employed | 146 | 13 | 159 |
## | 0.137 | 1.117 | |
## | 0.918 | 0.082 | 0.039 |
## | 0.040 | 0.029 | |
## | 0.035 | 0.003 | |
## --------------|-----------|-----------|-----------|
## services | 358 | 35 | 393 |
## | 0.184 | 1.499 | |
## | 0.911 | 0.089 | 0.095 |
## | 0.098 | 0.078 | |
## | 0.087 | 0.008 | |
## --------------|-----------|-----------|-----------|
## student | 63 | 19 | 82 |
## | 1.375 | 11.186 | |
## | 0.768 | 0.232 | 0.020 |
## | 0.017 | 0.042 | |
## | 0.015 | 0.005 | |
## --------------|-----------|-----------|-----------|
## technician | 611 | 80 | 691 |
## | 0.031 | 0.249 | |
## | 0.884 | 0.116 | 0.168 |
## | 0.167 | 0.177 | |
## | 0.148 | 0.019 | |
## --------------|-----------|-----------|-----------|
## unemployed | 92 | 19 | 111 |
## | 0.474 | 3.857 | |
## | 0.829 | 0.171 | 0.027 |
## | 0.025 | 0.042 | |
## | 0.022 | 0.005 | |
## --------------|-----------|-----------|-----------|
## unknown | 35 | 4 | 39 |
## | 0.002 | 0.017 | |
## | 0.897 | 0.103 | 0.009 |
## | 0.010 | 0.009 | |
## | 0.008 | 0.001 | |
## --------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## | 0.891 | 0.109 | |
## --------------|-----------|-----------|-----------|
##
##
Analysis: The chi-square test for job has df = 11 (12 levels, including “unknown”, minus 1); the 95% critical value from the chi-square table is about 19.7. Summing the cell contributions in the cross-table gives a statistic of roughly 70, with a few levels (blue-collar, retired, and student) accounting for most of that total, so job is significantly associated with the response.
Recommendation to remove “unknown”: The “unknown” level contributes almost nothing to the chi-square statistic and covers only 39 observations, with just 4 of the 451 “yes” responses, so we recommend removing those rows.
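For completeness, the overall test for job can be run the same way as for the other categorical variables below (output omitted here; with only 39 “unknown” observations the usual warning about small expected counts may appear):
chisq.test(bank$job, bank$y)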
CrossTable(bank$marital, bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$marital | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## divorced | 403 | 43 | 446 |
## | 0.086 | 0.697 | |
## | 0.904 | 0.096 | 0.108 |
## | 0.110 | 0.095 | |
## | 0.098 | 0.010 | |
## -------------|-----------|-----------|-----------|
## married | 2257 | 252 | 2509 |
## | 0.231 | 1.879 | |
## | 0.900 | 0.100 | 0.609 |
## | 0.615 | 0.559 | |
## | 0.548 | 0.061 | |
## -------------|-----------|-----------|-----------|
## single | 998 | 155 | 1153 |
## | 0.805 | 6.550 | |
## | 0.866 | 0.134 | 0.280 |
## | 0.272 | 0.344 | |
## | 0.242 | 0.038 | |
## -------------|-----------|-----------|-----------|
## unknown | 10 | 1 | 11 |
## | 0.004 | 0.035 | |
## | 0.909 | 0.091 | 0.003 |
## | 0.003 | 0.002 | |
## | 0.002 | 0.000 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## | 0.891 | 0.109 | |
## -------------|-----------|-----------|-----------|
##
##
Analysis and recommendation: As with “job”, the “unknown” category of “marital” has very few observations (10 “no” and 1 “yes”), so we recommend removing it.
CrossTable(bank$education, bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$education | no | yes | Row Total |
## --------------------|-----------|-----------|-----------|
## basic.4y | 391 | 38 | 429 |
## | 0.211 | 1.714 | |
## | 0.911 | 0.089 | 0.104 |
## | 0.107 | 0.084 | |
## | 0.095 | 0.009 | |
## --------------------|-----------|-----------|-----------|
## basic.6y | 211 | 17 | 228 |
## | 0.312 | 2.541 | |
## | 0.925 | 0.075 | 0.055 |
## | 0.058 | 0.038 | |
## | 0.051 | 0.004 | |
## --------------------|-----------|-----------|-----------|
## basic.9y | 531 | 43 | 574 |
## | 0.771 | 6.269 | |
## | 0.925 | 0.075 | 0.139 |
## | 0.145 | 0.095 | |
## | 0.129 | 0.010 | |
## --------------------|-----------|-----------|-----------|
## high.school | 824 | 97 | 921 |
## | 0.018 | 0.146 | |
## | 0.895 | 0.105 | 0.224 |
## | 0.225 | 0.215 | |
## | 0.200 | 0.024 | |
## --------------------|-----------|-----------|-----------|
## illiterate | 1 | 0 | 1 |
## | 0.013 | 0.109 | |
## | 1.000 | 0.000 | 0.000 |
## | 0.000 | 0.000 | |
## | 0.000 | 0.000 | |
## --------------------|-----------|-----------|-----------|
## professional.course | 470 | 65 | 535 |
## | 0.087 | 0.704 | |
## | 0.879 | 0.121 | 0.130 |
## | 0.128 | 0.144 | |
## | 0.114 | 0.016 | |
## --------------------|-----------|-----------|-----------|
## university.degree | 1099 | 165 | 1264 |
## | 0.629 | 5.113 | |
## | 0.869 | 0.131 | 0.307 |
## | 0.300 | 0.366 | |
## | 0.267 | 0.040 | |
## --------------------|-----------|-----------|-----------|
## unknown | 141 | 26 | 167 |
## | 0.400 | 3.255 | |
## | 0.844 | 0.156 | 0.041 |
## | 0.038 | 0.058 | |
## | 0.034 | 0.006 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## | 0.891 | 0.109 | |
## --------------------|-----------|-----------|-----------|
##
##
chisq.test(bank$education, bank$y)
##
## Pearson's Chi-squared test
##
## data: bank$education and bank$y
## X-squared = 22.292, df = 7, p-value = 0.002262
Analysis and recommendation: The chi-square test (X-squared = 22.29, df = 7, p-value = 0.0023) shows that education is significantly associated with the response (H0: education and y are independent; Ha: they are not). Removing the “unknown” level is still a reasonable option, even though its chi-square contribution is comparable to that of the other categories.
CrossTable(bank$housing, bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$housing | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## no | 1637 | 202 | 1839 |
## | 0.000 | 0.002 | |
## | 0.890 | 0.110 | 0.446 |
## | 0.446 | 0.448 | |
## | 0.397 | 0.049 | |
## -------------|-----------|-----------|-----------|
## unknown | 96 | 9 | 105 |
## | 0.067 | 0.542 | |
## | 0.914 | 0.086 | 0.025 |
## | 0.026 | 0.020 | |
## | 0.023 | 0.002 | |
## -------------|-----------|-----------|-----------|
## yes | 1935 | 240 | 2175 |
## | 0.002 | 0.014 | |
## | 0.890 | 0.110 | 0.528 |
## | 0.528 | 0.532 | |
## | 0.470 | 0.058 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## | 0.891 | 0.109 | |
## -------------|-----------|-----------|-----------|
##
##
chisq.test(bank$housing, bank$y)
##
## Pearson's Chi-squared test
##
## data: bank$housing and bank$y
## X-squared = 0.62738, df = 2, p-value = 0.7307
Analysis and recommendation: The chi-square test (p-value = 0.7307) shows no significant association between housing and the response (H0: housing and y are independent; Ha: they are not). We recommend NOT TO USE it in the model.
CrossTable(bank$loan, bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$loan | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## no | 2975 | 374 | 3349 |
## | 0.018 | 0.146 | |
## | 0.888 | 0.112 | 0.813 |
## | 0.811 | 0.829 | |
## | 0.722 | 0.091 | |
## -------------|-----------|-----------|-----------|
## unknown | 96 | 9 | 105 |
## | 0.067 | 0.542 | |
## | 0.914 | 0.086 | 0.025 |
## | 0.026 | 0.020 | |
## | 0.023 | 0.002 | |
## -------------|-----------|-----------|-----------|
## yes | 597 | 68 | 665 |
## | 0.039 | 0.318 | |
## | 0.898 | 0.102 | 0.161 |
## | 0.163 | 0.151 | |
## | 0.145 | 0.017 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## | 0.891 | 0.109 | |
## -------------|-----------|-----------|-----------|
##
##
chisq.test(bank$loan, bank$y)
##
## Pearson's Chi-squared test
##
## data: bank$loan and bank$y
## X-squared = 1.1297, df = 2, p-value = 0.5684
Analysis and recommendation: The chi-square test (p-value = 0.5684) shows no significant association between loan and the response (H0: loan and y are independent; Ha: they are not). We recommend NOT TO USE it in the model.
table(bank$pdays)
##
## 0 1 2 3 4 5 6 7 9 10 11 12 13 14 15 16
## 2 3 4 52 14 4 42 10 3 8 1 5 2 1 2 2
## 17 18 19 21 999
## 1 2 1 1 3959
We see 3959 observations with the value 999, which means the client was never contacted before, so we recode the variable as a binary indicator: 999 becomes 0 (not previously contacted) and every other value becomes 1 (previously contacted).
We then verify the recoding and check the final distribution.
bank$pdays_1<-ifelse(bank$pdays==999,0,1)
table(bank$pdays_1)
##
## 0 1
## 3959 160
CrossTable(bank$pdays_1, bank$y)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$pdays_1 | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 3608 | 351 | 3959 |
## | 1.930 | 15.694 | |
## | 0.911 | 0.089 | 0.961 |
## | 0.984 | 0.778 | |
## | 0.876 | 0.085 | |
## -------------|-----------|-----------|-----------|
## 1 | 60 | 100 | 160 |
## | 47.748 | 388.334 | |
## | 0.375 | 0.625 | 0.039 |
## | 0.016 | 0.222 | |
## | 0.015 | 0.024 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## | 0.891 | 0.109 | |
## -------------|-----------|-----------|-----------|
##
##
chisq.test(bank$pdays_1, bank$y)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: bank$pdays_1 and bank$y
## X-squared = 448.22, df = 1, p-value < 2.2e-16
Analysis: Here we see a near-“perfect separation” in pdays_1, similar to “duration”: clients contacted in a previous campaign subscribe far more often (62.5%) than those never contacted (8.9%). The concern is the same as the one discussed for “duration”.
plot(bank$y)
Analysis: There are far more “no” responses than “yes”, so the data are clearly imbalanced.
pairs1 = bank[, c("y","age","duration","campaign","previous",
"emp.var.rate","cons.price.idx","cons.conf.idx")]
pairs(pairs1)
pairs2 = bank[, c("y","job","marital","education","default","housing","loan",
"contact","month","day_of_week","poutcome","pdays_1")]
pairs(pairs2)
Analysis: From the above plots, none of the variables appear to be strongly correlated with one another.
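A numeric correlation matrix complements the pairs plots and is a quick way to spot the collinearity among the macroeconomic indicators that shows up later in the VIFs; a sketch:
num_cols <- sapply(bank, is.numeric)   # select the numeric columns
round(cor(bank[, num_cols]), 2)        # pairwise correlations, rounded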
The first step is to create a copy of the dataset before removing those rows.
bank_clean <- bank
We will remove the “Unknown” Category from the following Factor Variables: Job, Education, Marital.
bank_clean <- subset(bank_clean, bank_clean$job != "unknown")
From 4119 observations to 4080 (39 rows with unknown job removed).
bank_clean <- subset(bank_clean, bank_clean$education != "unknown")
From 4080 observations to 3926 (154 rows removed for unknown education; some of the 167 unknown-education rows also had unknown job and were dropped in the previous step).
bank_clean <- subset(bank_clean, bank_clean$marital != "unknown")
From 3926 observations to 3915 (11 rows with unknown marital removed).
bank_clean = subset(bank_clean, select = -c(age, duration, default, day_of_week, housing, loan, pdays))
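Note that subsetting removes the rows but keeps the now-empty “unknown” factor levels; if those empty levels would cause trouble downstream (for example in model matrices), they can be dropped with a one-line sketch:
bank_clean <- droplevels(bank_clean)   # drop unused levels from every factor column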
table(bank$y)
##
## no yes
## 3668 451
table(bank_clean$y)
##
## no yes
## 3493 422
Analysis: We can see from the table that the dataset is not balanced; it is heavily skewed toward “no”.
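The imbalance can also be expressed as proportions (roughly 89% “no” versus 11% “yes”); a one-line sketch:
prop.table(table(bank_clean$y))   # class proportions instead of raw counts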
Model_1 = glm(data = bank, formula = y ~ .-pdays, family = binomial)
summary(Model_1)
##
## Call:
## glm(formula = y ~ . - pdays, family = binomial, data = bank)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9381 -0.2840 -0.1750 -0.1135 2.8050
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.452e+02 1.228e+02 -1.182 0.237231
## age 8.665e-03 8.137e-03 1.065 0.286955
## jobblue-collar -2.270e-01 2.706e-01 -0.839 0.401439
## jobentrepreneur -7.740e-01 4.988e-01 -1.552 0.120700
## jobhousemaid 2.476e-01 4.462e-01 0.555 0.578923
## jobmanagement -3.135e-01 2.854e-01 -1.099 0.271921
## jobretired -2.065e-01 3.464e-01 -0.596 0.551034
## jobself-employed -7.305e-01 4.163e-01 -1.755 0.079318 .
## jobservices 1.248e-01 2.821e-01 0.442 0.658172
## jobstudent -7.303e-02 3.981e-01 -0.183 0.854460
## jobtechnician 2.209e-01 2.243e-01 0.985 0.324641
## jobunemployed 3.363e-01 3.912e-01 0.860 0.389974
## jobunknown -4.622e-01 7.254e-01 -0.637 0.523949
## maritalmarried 2.781e-01 2.457e-01 1.132 0.257592
## maritalsingle 3.275e-01 2.797e-01 1.171 0.241652
## maritalunknown 2.767e-01 1.145e+00 0.242 0.808961
## educationbasic.6y 3.220e-01 4.077e-01 0.790 0.429662
## educationbasic.9y 2.012e-01 3.214e-01 0.626 0.531376
## educationhigh.school 1.516e-01 3.066e-01 0.494 0.620998
## educationilliterate -1.144e+01 5.354e+02 -0.021 0.982946
## educationprofessional.course 1.168e-01 3.355e-01 0.348 0.727827
## educationuniversity.degree 3.187e-01 3.088e-01 1.032 0.301945
## educationunknown 2.424e-01 3.879e-01 0.625 0.532044
## defaultunknown 1.452e-01 2.099e-01 0.692 0.488982
## defaultyes -8.819e+00 5.354e+02 -0.016 0.986858
## housingunknown -5.595e-01 5.219e-01 -1.072 0.283765
## housingyes -6.737e-02 1.376e-01 -0.490 0.624360
## loanunknown NA NA NA NA
## loanyes -1.145e-01 1.870e-01 -0.613 0.540196
## contacttelephone -9.556e-01 2.784e-01 -3.432 0.000599 ***
## monthaug 4.943e-01 4.140e-01 1.194 0.232468
## monthdec 8.874e-01 6.760e-01 1.313 0.189255
## monthjul 9.378e-02 3.617e-01 0.259 0.795414
## monthjun 5.521e-01 4.343e-01 1.271 0.203680
## monthmar 2.474e+00 5.182e-01 4.775 1.8e-06 ***
## monthmay -3.229e-01 3.007e-01 -1.074 0.282919
## monthnov -3.241e-01 4.188e-01 -0.774 0.438975
## monthoct 2.819e-01 5.276e-01 0.534 0.593219
## monthsep 1.550e-01 5.965e-01 0.260 0.795025
## day_of_weekmon 1.426e-01 2.130e-01 0.669 0.503244
## day_of_weekthu 1.023e-01 2.153e-01 0.475 0.634645
## day_of_weektue 4.485e-02 2.189e-01 0.205 0.837645
## day_of_weekwed 3.127e-01 2.222e-01 1.407 0.159371
## duration 5.260e-03 2.608e-04 20.167 < 2e-16 ***
## campaign -9.987e-02 4.591e-02 -2.175 0.029610 *
## previous 1.229e-01 1.714e-01 0.717 0.473384
## poutcomenonexistent 5.801e-01 2.950e-01 1.966 0.049255 *
## poutcomesuccess 1.343e+00 6.327e-01 2.123 0.033741 *
## emp.var.rate -8.651e-01 4.651e-01 -1.860 0.062889 .
## cons.price.idx 1.408e+00 8.109e-01 1.737 0.082414 .
## cons.conf.idx 6.440e-02 2.633e-02 2.446 0.014452 *
## euribor3m -1.610e-01 4.172e-01 -0.386 0.699657
## nr.employed 2.153e-03 9.924e-03 0.217 0.828263
## pdays_1 4.752e-01 6.382e-01 0.745 0.456520
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2845.8 on 4118 degrees of freedom
## Residual deviance: 1597.7 on 4066 degrees of freedom
## AIC: 1703.7
##
## Number of Fisher Scoring iterations: 12
Analysis: The “duration” attribute highly affects the output target (e.g., if duration = 0 then y = “no”), yet the duration is not known before a call is performed, and once the call ends y is obviously known. This leads to a “perfect separation” problem; thus, this input should be discarded if the intention is to have a realistic predictive model.
set.seed(1)
row.number = sample(1:nrow(bank_clean), 0.8*nrow(bank_clean))
train_bank = bank_clean[row.number,]
test_bank = bank_clean[-row.number,]
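Because the classes are imbalanced, a plain random split can leave the small “yes” class under-represented in the test set. A stratified alternative is sketched below (hypothetical names train_bank_s and test_bank_s; assumes the caret package, which is used later for confusionMatrix):
idx <- caret::createDataPartition(bank_clean$y, p = 0.8, list = FALSE)[, 1]  # stratified 80% sample
train_bank_s <- bank_clean[idx, ]    # training set preserving the yes/no ratio
test_bank_s  <- bank_clean[-idx, ]   # held-out test set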
Model_2 = glm(formula = y ~ ., data = train_bank, family = binomial)
summary(Model_2)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = train_bank)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8982 -0.3985 -0.3319 -0.2632 2.9531
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.392e+02 1.191e+02 -2.008 0.04462 *
## jobblue-collar -1.755e-01 2.587e-01 -0.679 0.49743
## jobentrepreneur -3.715e-01 4.598e-01 -0.808 0.41910
## jobhousemaid -5.622e-02 4.451e-01 -0.126 0.89947
## jobmanagement -4.907e-01 2.893e-01 -1.696 0.08987 .
## jobretired 5.637e-02 3.085e-01 0.183 0.85503
## jobself-employed -4.570e-01 3.896e-01 -1.173 0.24074
## jobservices -2.736e-01 2.792e-01 -0.980 0.32721
## jobstudent 2.102e-01 4.137e-01 0.508 0.61127
## jobtechnician 5.928e-02 2.223e-01 0.267 0.78977
## jobunemployed 3.578e-01 3.601e-01 0.994 0.32045
## maritalmarried -7.156e-03 2.194e-01 -0.033 0.97399
## maritalsingle -2.181e-02 2.407e-01 -0.091 0.92779
## educationbasic.6y 8.546e-02 3.743e-01 0.228 0.81937
## educationbasic.9y 1.033e-01 2.955e-01 0.349 0.72679
## educationhigh.school -1.344e-02 2.839e-01 -0.047 0.96226
## educationilliterate -1.141e+01 3.247e+02 -0.035 0.97197
## educationprofessional.course 2.484e-02 3.105e-01 0.080 0.93623
## educationuniversity.degree 1.651e-01 2.859e-01 0.577 0.56373
## contacttelephone -1.117e+00 2.799e-01 -3.992 6.56e-05 ***
## monthaug -5.640e-05 4.178e-01 0.000 0.99989
## monthdec 7.167e-01 6.885e-01 1.041 0.29785
## monthjul -1.310e-01 3.468e-01 -0.378 0.70561
## monthjun -1.937e-01 4.202e-01 -0.461 0.64485
## monthmar 2.101e+00 5.125e-01 4.099 4.15e-05 ***
## monthmay -2.602e-01 2.945e-01 -0.883 0.37702
## monthnov -4.898e-01 4.007e-01 -1.222 0.22160
## monthoct 2.268e-01 5.071e-01 0.447 0.65466
## monthsep -3.731e-01 6.149e-01 -0.607 0.54400
## campaign -5.033e-02 3.638e-02 -1.384 0.16647
## previous 1.760e-01 1.842e-01 0.955 0.33935
## poutcomenonexistent 5.893e-01 3.079e-01 1.914 0.05566 .
## poutcomesuccess 1.310e+00 6.638e-01 1.974 0.04839 *
## emp.var.rate -1.236e+00 4.539e-01 -2.723 0.00647 **
## cons.price.idx 2.004e+00 7.883e-01 2.542 0.01102 *
## cons.conf.idx 6.686e-02 2.639e-02 2.534 0.01128 *
## euribor3m -5.715e-02 4.180e-01 -0.137 0.89123
## nr.employed 1.013e-02 9.638e-03 1.051 0.29336
## pdays_1 6.096e-01 6.623e-01 0.920 0.35734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2147.4 on 3131 degrees of freedom
## Residual deviance: 1689.9 on 3093 degrees of freedom
## AIC: 1767.9
##
## Number of Fisher Scoring iterations: 11
vif(Model_2)
## GVIF Df GVIF^(1/(2*Df))
## job 4.265087 10 1.075218
## marital 1.211676 2 1.049172
## education 3.146399 6 1.100233
## contact 3.110746 1 1.763731
## month 83.753461 9 1.278891
## campaign 1.056256 1 1.027743
## previous 4.319643 1 2.078375
## poutcome 20.444817 2 2.126404
## emp.var.rate 145.429495 1 12.059415
## cons.price.idx 64.240868 1 8.015040
## cons.conf.idx 5.367253 1 2.316733
## euribor3m 142.371477 1 11.931952
## nr.employed 168.653073 1 12.986650
## pdays_1 8.767318 1 2.960966
Analysis: The VIF output shows strong multicollinearity among the macroeconomic indicators (emp.var.rate, euribor3m, and nr.employed all have GVIFs above 100). We will run a stepwise selection of the variables using AIC.
glm.null.train_bank = glm(y ~ 1, data = train_bank, family = "binomial")
glm.full.train_bank = glm(y ~ .,
data = train_bank, family = "binomial")
step.AIC1 = step(glm.null.train_bank, scope = list(upper=glm.full.train_bank),
direction ="both",test ="Chisq", trace = F)
step.AIC1
##
## Call: glm(formula = y ~ nr.employed + month + poutcome + contact +
## cons.conf.idx + campaign, family = "binomial", data = train_bank)
##
## Coefficients:
## (Intercept) nr.employed monthaug
## 47.238245 -0.009203 -0.438389
## monthdec monthjul monthjun
## 0.386576 0.038092 0.670823
## monthmar monthmay monthnov
## 1.589938 -0.541502 -0.518121
## monthoct monthsep poutcomenonexistent
## -0.029950 -1.021362 0.292429
## poutcomesuccess contacttelephone cons.conf.idx
## 1.913730 -0.817545 0.045876
## campaign
## -0.052587
##
## Degrees of Freedom: 3131 Total (i.e. Null); 3116 Residual
## Null Deviance: 2147
## Residual Deviance: 1710 AIC: 1742
summary(step.AIC1)
##
## Call:
## glm(formula = y ~ nr.employed + month + poutcome + contact +
## cons.conf.idx + campaign, family = "binomial", data = train_bank)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0646 -0.3861 -0.3460 -0.2592 2.7320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 47.238245 5.338645 8.848 < 2e-16 ***
## nr.employed -0.009203 0.001062 -8.667 < 2e-16 ***
## monthaug -0.438389 0.341300 -1.284 0.198979
## monthdec 0.386576 0.641209 0.603 0.546584
## monthjul 0.038092 0.308172 0.124 0.901627
## monthjun 0.670823 0.304145 2.206 0.027412 *
## monthmar 1.589938 0.423864 3.751 0.000176 ***
## monthmay -0.541502 0.266508 -2.032 0.042170 *
## monthnov -0.518121 0.312337 -1.659 0.097146 .
## monthoct -0.029950 0.418612 -0.072 0.942964
## monthsep -1.021362 0.473896 -2.155 0.031143 *
## poutcomenonexistent 0.292429 0.197733 1.479 0.139166
## poutcomesuccess 1.913730 0.289441 6.612 3.80e-11 ***
## contacttelephone -0.817545 0.205408 -3.980 6.89e-05 ***
## cons.conf.idx 0.045876 0.015380 2.983 0.002855 **
## campaign -0.052587 0.036262 -1.450 0.147005
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2147.4 on 3131 degrees of freedom
## Residual deviance: 1709.9 on 3116 degrees of freedom
## AIC: 1741.9
##
## Number of Fisher Scoring iterations: 6
hoslem.test(step.AIC1$y, fitted(step.AIC1), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: step.AIC1$y, fitted(step.AIC1)
## X-squared = 4.2476, df = 8, p-value = 0.8341
Analysis: Since the Hosmer-Lemeshow test yields a large p-value of 0.8341, we do not reject the null hypothesis, which suggests the model fits the data adequately.
test_bank$PredProb = predict.glm(step.AIC1, newdata = test_bank, type = "response")
test_bank$PredSub = ifelse(test_bank$PredProb >= 0.5, "yes", "no")
table(test_bank$PredSub)
##
## no yes
## 751 32
caret::confusionMatrix(test_bank$y,as.factor(test_bank$PredSub))
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 688 12
## yes 63 20
##
## Accuracy : 0.9042
## 95% CI : (0.8814, 0.9239)
## No Information Rate : 0.9591
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3069
##
## Mcnemar's Test P-Value : 7.764e-09
##
## Sensitivity : 0.9161
## Specificity : 0.6250
## Pos Pred Value : 0.9829
## Neg Pred Value : 0.2410
## Prevalence : 0.9591
## Detection Rate : 0.8787
## Detection Prevalence : 0.8940
## Balanced Accuracy : 0.7706
##
## 'Positive' Class : no
##
PredProb1 = prediction(predict.glm(step.AIC1, newdata = test_bank, type = "response"), test_bank$y)
# Computing threshold for cutoff to best trade off sensitivity and specificity
plot(unlist(performance(PredProb1,'sens')@x.values),unlist(performance(PredProb1,'sens')@y.values), type='l', lwd=2, ylab = "", xlab = 'Cutoff')
mtext('Sensitivity',side=2)
mtext('Sensitivity vs. Specificity Plot for AIC Model', side=3)
# Second specificity in same plot
par(new=TRUE)
plot(unlist(performance(PredProb1,'spec')@x.values),unlist(performance(PredProb1,'spec')@y.values), type='l', lwd=2,col='red', ylab = "", xlab = 'Cutoff')
axis(4,at=seq(0,1,0.2))
mtext('Specificity',side=4, col='red')
par(new=TRUE)
min.diff <-which.min(abs(unlist(performance(PredProb1, "sens")@y.values) - unlist(performance(PredProb1, "spec")@y.values)))
min.x<-unlist(performance(PredProb1, "sens")@x.values)[min.diff]
min.y<-unlist(performance(PredProb1, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
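In addition to the sensitivity/specificity curves, the area under the ROC curve can be extracted from the same ROCR prediction object; a sketch:
auc_value <- performance(PredProb1, "auc")@y.values[[1]]   # AUC for the AIC model
auc_value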
test_bank$PredSubOptimal = ifelse(test_bank$PredProb >= 0.08, "yes", "no")
table(test_bank$PredSubOptimal)
##
## no yes
## 537 246
caret::confusionMatrix(test_bank$y,as.factor(test_bank$PredSubOptimal))
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 516 184
## yes 21 62
##
## Accuracy : 0.7382
## 95% CI : (0.7059, 0.7687)
## No Information Rate : 0.6858
## P-Value [Acc > NIR] : 0.0007765
##
## Kappa : 0.2595
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9609
## Specificity : 0.2520
## Pos Pred Value : 0.7371
## Neg Pred Value : 0.7470
## Prevalence : 0.6858
## Detection Rate : 0.6590
## Detection Prevalence : 0.8940
## Balanced Accuracy : 0.6065
##
## 'Positive' Class : no
##
Analysis: Even though overall accuracy decreases, the optimal cutoff captures far more of the true subscribers (62 versus 20 with the 0.5 cutoff). Hence it is better to use the optimal cutoff than the arbitrary 0.5 cutoff.