Load the file

bank <- read_delim("bank-additional.csv", 
                              delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 4119 Columns: 21
## -- Column specification --------------------------------------------------------
## Delimiter: ";"
## chr (11): job, marital, education, default, housing, loan, contact, month, d...
## dbl (10): age, duration, campaign, pdays, previous, emp.var.rate, cons.price...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.



Overview of the dataset

str(bank)
## spec_tbl_df [4,119 x 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age           : num [1:4119] 30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : chr [1:4119] "blue-collar" "services" "services" "services" ...
##  $ marital       : chr [1:4119] "married" "single" "married" "married" ...
##  $ education     : chr [1:4119] "basic.9y" "high.school" "high.school" "basic.9y" ...
##  $ default       : chr [1:4119] "no" "no" "no" "no" ...
##  $ housing       : chr [1:4119] "yes" "no" "yes" "unknown" ...
##  $ loan          : chr [1:4119] "no" "no" "no" "unknown" ...
##  $ contact       : chr [1:4119] "cellular" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:4119] "may" "may" "jun" "jun" ...
##  $ day_of_week   : chr [1:4119] "fri" "fri" "wed" "fri" ...
##  $ duration      : num [1:4119] 487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : num [1:4119] 2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : num [1:4119] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : num [1:4119] 0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : chr [1:4119] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num [1:4119] -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num [1:4119] 92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num [1:4119] -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num [1:4119] 1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num [1:4119] 5099 5191 5228 5228 5196 ...
##  $ y             : chr [1:4119] "no" "no" "no" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   job = col_character(),
##   ..   marital = col_character(),
##   ..   education = col_character(),
##   ..   default = col_character(),
##   ..   housing = col_character(),
##   ..   loan = col_character(),
##   ..   contact = col_character(),
##   ..   month = col_character(),
##   ..   day_of_week = col_character(),
##   ..   duration = col_double(),
##   ..   campaign = col_double(),
##   ..   pdays = col_double(),
##   ..   previous = col_double(),
##   ..   poutcome = col_character(),
##   ..   emp.var.rate = col_double(),
##   ..   cons.price.idx = col_double(),
##   ..   cons.conf.idx = col_double(),
##   ..   euribor3m = col_double(),
##   ..   nr.employed = col_double(),
##   ..   y = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>



Data Cleaning and Preparation

1) Character Variables to Factor Variables

bank$job = as.factor(bank$job)
bank$education = as.factor(bank$education)
bank$marital = as.factor(bank$marital)
bank$default = as.factor(bank$default)
bank$housing = as.factor(bank$housing)
bank$loan = as.factor(bank$loan)
bank$contact = as.factor(bank$contact)
bank$month = as.factor(bank$month)
bank$day_of_week = as.factor(bank$day_of_week)
bank$poutcome = as.factor(bank$poutcome)
bank$y = as.factor(bank$y)



2) Summary: Looking for odd patterns

summary(bank)
##       age                 job           marital                   education   
##  Min.   :18.00   admin.     :1012   divorced: 446   university.degree  :1264  
##  1st Qu.:32.00   blue-collar: 884   married :2509   high.school        : 921  
##  Median :38.00   technician : 691   single  :1153   basic.9y           : 574  
##  Mean   :40.11   services   : 393   unknown :  11   professional.course: 535  
##  3rd Qu.:47.00   management : 324                   basic.4y           : 429  
##  Max.   :88.00   retired    : 166                   basic.6y           : 228  
##                  (Other)    : 649                   (Other)            : 168  
##     default        housing          loan           contact         month     
##  no     :3315   no     :1839   no     :3349   cellular :2652   may    :1378  
##  unknown: 803   unknown: 105   unknown: 105   telephone:1467   jul    : 711  
##  yes    :   1   yes    :2175   yes    : 665                    aug    : 636  
##                                                                jun    : 530  
##                                                                nov    : 446  
##                                                                apr    : 215  
##                                                                (Other): 203  
##  day_of_week    duration         campaign          pdays          previous     
##  fri:768     Min.   :   0.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  mon:855     1st Qu.: 103.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  thu:860     Median : 181.0   Median : 2.000   Median :999.0   Median :0.0000  
##  tue:841     Mean   : 256.8   Mean   : 2.537   Mean   :960.4   Mean   :0.1903  
##  wed:795     3rd Qu.: 317.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##              Max.   :3643.0   Max.   :35.000   Max.   :999.0   Max.   :6.0000  
##                                                                                
##         poutcome     emp.var.rate      cons.price.idx  cons.conf.idx  
##  failure    : 454   Min.   :-3.40000   Min.   :92.20   Min.   :-50.8  
##  nonexistent:3523   1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7  
##  success    : 142   Median : 1.10000   Median :93.75   Median :-41.8  
##                     Mean   : 0.08497   Mean   :93.58   Mean   :-40.5  
##                     3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4  
##                     Max.   : 1.40000   Max.   :94.77   Max.   :-26.9  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.635   Min.   :4964   no :3668  
##  1st Qu.:1.334   1st Qu.:5099   yes: 451  
##  Median :4.857   Median :5191             
##  Mean   :3.621   Mean   :5166             
##  3rd Qu.:4.961   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
## 



3) Finding Duplicates

sum(duplicated(bank))
## [1] 0

There are no duplicates in the dataset.



4) Verify numeric variables with NAs

sum(is.na(bank$age))
## [1] 0
sum(is.na(bank$campaign))
## [1] 0
sum(is.na(bank$pdays))
## [1] 0
sum(is.na(bank$previous))
## [1] 0
sum(is.na(bank$emp.var.rate))
## [1] 0
sum(is.na(bank$cons.price.idx))
## [1] 0
sum(is.na(bank$cons.conf.idx))
## [1] 0
sum(is.na(bank$nr.employed))
## [1] 0

There are no missing values found.



5) Exploratory data analysis

Job variable

ggplot(bank,aes(job))+geom_bar(aes(fill= y),position = position_dodge())+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Analysis: Admins are the largest group who subscribed to the term deposit and this might be due admins are the largest group in the distribution.

Marital variable

ggplot(bank,aes(marital))+geom_bar(aes(fill= y),position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Analysis: Keep variable

Education

ggplot(bank,aes(education))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Analysis: Keep variable

Age Boxplot

ggplot(bank,aes(x = y,y = age))+geom_boxplot(aes(fill= y))+xlab("Subscribed")

Analysis: We can observe that age is not so good predictor for the whether customer subscribed term deposit, since the mean is almost same. We recommend NOT TO KEEP IT in the model.

Day_of_week Barplot

ggplot(bank,aes(day_of_week))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Analysis: We observe that the number of customers subscribed are fairly equal among all the days of the week.Hence this might not be a good predictor for our response variable.

Month Barplot

ggplot(bank,aes(month))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Analysis: From the plots, we observe that May has highest number of subscribers and this would be a good predictor

Housing Barplot

ggplot(bank,aes(housing))+geom_bar(aes(fill= y), position = position_dodge())+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Analysis: From the plots, we observe that there is no significance of housing on the number of subscribers

6) Analyzing variables with “unknown” categories

Cross Tables

CrossTable(bank$job, bank$y)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##               | bank$y 
##      bank$job |        no |       yes | Row Total | 
## --------------|-----------|-----------|-----------|
##        admin. |       879 |       133 |      1012 | 
##               |     0.547 |     4.445 |           | 
##               |     0.869 |     0.131 |     0.246 | 
##               |     0.240 |     0.295 |           | 
##               |     0.213 |     0.032 |           | 
## --------------|-----------|-----------|-----------|
##   blue-collar |       823 |        61 |       884 | 
##               |     1.627 |    13.235 |           | 
##               |     0.931 |     0.069 |     0.215 | 
##               |     0.224 |     0.135 |           | 
##               |     0.200 |     0.015 |           | 
## --------------|-----------|-----------|-----------|
##  entrepreneur |       140 |         8 |       148 | 
##               |     0.511 |     4.154 |           | 
##               |     0.946 |     0.054 |     0.036 | 
##               |     0.038 |     0.018 |           | 
##               |     0.034 |     0.002 |           | 
## --------------|-----------|-----------|-----------|
##     housemaid |        99 |        11 |       110 | 
##               |     0.011 |     0.091 |           | 
##               |     0.900 |     0.100 |     0.027 | 
##               |     0.027 |     0.024 |           | 
##               |     0.024 |     0.003 |           | 
## --------------|-----------|-----------|-----------|
##    management |       294 |        30 |       324 | 
##               |     0.104 |     0.845 |           | 
##               |     0.907 |     0.093 |     0.079 | 
##               |     0.080 |     0.067 |           | 
##               |     0.071 |     0.007 |           | 
## --------------|-----------|-----------|-----------|
##       retired |       128 |        38 |       166 | 
##               |     2.659 |    21.622 |           | 
##               |     0.771 |     0.229 |     0.040 | 
##               |     0.035 |     0.084 |           | 
##               |     0.031 |     0.009 |           | 
## --------------|-----------|-----------|-----------|
## self-employed |       146 |        13 |       159 | 
##               |     0.137 |     1.117 |           | 
##               |     0.918 |     0.082 |     0.039 | 
##               |     0.040 |     0.029 |           | 
##               |     0.035 |     0.003 |           | 
## --------------|-----------|-----------|-----------|
##      services |       358 |        35 |       393 | 
##               |     0.184 |     1.499 |           | 
##               |     0.911 |     0.089 |     0.095 | 
##               |     0.098 |     0.078 |           | 
##               |     0.087 |     0.008 |           | 
## --------------|-----------|-----------|-----------|
##       student |        63 |        19 |        82 | 
##               |     1.375 |    11.186 |           | 
##               |     0.768 |     0.232 |     0.020 | 
##               |     0.017 |     0.042 |           | 
##               |     0.015 |     0.005 |           | 
## --------------|-----------|-----------|-----------|
##    technician |       611 |        80 |       691 | 
##               |     0.031 |     0.249 |           | 
##               |     0.884 |     0.116 |     0.168 | 
##               |     0.167 |     0.177 |           | 
##               |     0.148 |     0.019 |           | 
## --------------|-----------|-----------|-----------|
##    unemployed |        92 |        19 |       111 | 
##               |     0.474 |     3.857 |           | 
##               |     0.829 |     0.171 |     0.027 | 
##               |     0.025 |     0.042 |           | 
##               |     0.022 |     0.005 |           | 
## --------------|-----------|-----------|-----------|
##       unknown |        35 |         4 |        39 | 
##               |     0.002 |     0.017 |           | 
##               |     0.897 |     0.103 |     0.009 | 
##               |     0.010 |     0.009 |           | 
##               |     0.008 |     0.001 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |      3668 |       451 |      4119 | 
##               |     0.891 |     0.109 |           | 
## --------------|-----------|-----------|-----------|
## 
## 

Analysis: We will be using a 95% critical value, with a df = 11 (12 levels with the unknown - 1 per formula). Critical Value = 4.575 (from Chi-square table). With the cross-table we actually saw that most of the levels don’t get passed the critical value, which means that they are significant.

Recommendation to remove “unknown”: Even when the chi-square contribution of “unknown” doesn’t get passed the critical value, which makes it significant, this is a variable that has only 39 observations, only 4 “yes” out of 451 of the variable, so we recommend to remove them.



Cross table - Marital

CrossTable(bank$marital, bank$y)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$marital |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##     divorced |       403 |        43 |       446 | 
##              |     0.086 |     0.697 |           | 
##              |     0.904 |     0.096 |     0.108 | 
##              |     0.110 |     0.095 |           | 
##              |     0.098 |     0.010 |           | 
## -------------|-----------|-----------|-----------|
##      married |      2257 |       252 |      2509 | 
##              |     0.231 |     1.879 |           | 
##              |     0.900 |     0.100 |     0.609 | 
##              |     0.615 |     0.559 |           | 
##              |     0.548 |     0.061 |           | 
## -------------|-----------|-----------|-----------|
##       single |       998 |       155 |      1153 | 
##              |     0.805 |     6.550 |           | 
##              |     0.866 |     0.134 |     0.280 | 
##              |     0.272 |     0.344 |           | 
##              |     0.242 |     0.038 |           | 
## -------------|-----------|-----------|-----------|
##      unknown |        10 |         1 |        11 | 
##              |     0.004 |     0.035 |           | 
##              |     0.909 |     0.091 |     0.003 | 
##              |     0.003 |     0.002 |           | 
##              |     0.002 |     0.000 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
##              |     0.891 |     0.109 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Analysis and recommendation: Same as “job” , the “unknown” category for “marital” has very few observations (10 = no, and 1 = yes), we recommend to remove it.

Cross-table for Education

CrossTable(bank$education, bank$y)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##                     | bank$y 
##      bank$education |        no |       yes | Row Total | 
## --------------------|-----------|-----------|-----------|
##            basic.4y |       391 |        38 |       429 | 
##                     |     0.211 |     1.714 |           | 
##                     |     0.911 |     0.089 |     0.104 | 
##                     |     0.107 |     0.084 |           | 
##                     |     0.095 |     0.009 |           | 
## --------------------|-----------|-----------|-----------|
##            basic.6y |       211 |        17 |       228 | 
##                     |     0.312 |     2.541 |           | 
##                     |     0.925 |     0.075 |     0.055 | 
##                     |     0.058 |     0.038 |           | 
##                     |     0.051 |     0.004 |           | 
## --------------------|-----------|-----------|-----------|
##            basic.9y |       531 |        43 |       574 | 
##                     |     0.771 |     6.269 |           | 
##                     |     0.925 |     0.075 |     0.139 | 
##                     |     0.145 |     0.095 |           | 
##                     |     0.129 |     0.010 |           | 
## --------------------|-----------|-----------|-----------|
##         high.school |       824 |        97 |       921 | 
##                     |     0.018 |     0.146 |           | 
##                     |     0.895 |     0.105 |     0.224 | 
##                     |     0.225 |     0.215 |           | 
##                     |     0.200 |     0.024 |           | 
## --------------------|-----------|-----------|-----------|
##          illiterate |         1 |         0 |         1 | 
##                     |     0.013 |     0.109 |           | 
##                     |     1.000 |     0.000 |     0.000 | 
##                     |     0.000 |     0.000 |           | 
##                     |     0.000 |     0.000 |           | 
## --------------------|-----------|-----------|-----------|
## professional.course |       470 |        65 |       535 | 
##                     |     0.087 |     0.704 |           | 
##                     |     0.879 |     0.121 |     0.130 | 
##                     |     0.128 |     0.144 |           | 
##                     |     0.114 |     0.016 |           | 
## --------------------|-----------|-----------|-----------|
##   university.degree |      1099 |       165 |      1264 | 
##                     |     0.629 |     5.113 |           | 
##                     |     0.869 |     0.131 |     0.307 | 
##                     |     0.300 |     0.366 |           | 
##                     |     0.267 |     0.040 |           | 
## --------------------|-----------|-----------|-----------|
##             unknown |       141 |        26 |       167 | 
##                     |     0.400 |     3.255 |           | 
##                     |     0.844 |     0.156 |     0.041 | 
##                     |     0.038 |     0.058 |           | 
##                     |     0.034 |     0.006 |           | 
## --------------------|-----------|-----------|-----------|
##        Column Total |      3668 |       451 |      4119 | 
##                     |     0.891 |     0.109 |           | 
## --------------------|-----------|-----------|-----------|
## 
## 

Chi-square test for Education

chisq.test(bank$education, bank$y)
## 
##  Pearson's Chi-squared test
## 
## data:  bank$education and bank$y
## X-squared = 22.292, df = 7, p-value = 0.002262

Analysis and recommendation: The variable “Education” is significant (basing our decision on Ho: m1 = m2 = m3 = m4 …, Ha: at least one of the groups has a different mean), removing the “unknown” might be a good option, even when the chi-square contribution is as high as other categories.

CrossTable(bank$housing, bank$y)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$housing |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |      1637 |       202 |      1839 | 
##              |     0.000 |     0.002 |           | 
##              |     0.890 |     0.110 |     0.446 | 
##              |     0.446 |     0.448 |           | 
##              |     0.397 |     0.049 |           | 
## -------------|-----------|-----------|-----------|
##      unknown |        96 |         9 |       105 | 
##              |     0.067 |     0.542 |           | 
##              |     0.914 |     0.086 |     0.025 | 
##              |     0.026 |     0.020 |           | 
##              |     0.023 |     0.002 |           | 
## -------------|-----------|-----------|-----------|
##          yes |      1935 |       240 |      2175 | 
##              |     0.002 |     0.014 |           | 
##              |     0.890 |     0.110 |     0.528 | 
##              |     0.528 |     0.532 |           | 
##              |     0.470 |     0.058 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
##              |     0.891 |     0.109 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Chi-square test: Housing

chisq.test(bank$housing, bank$y)
## 
##  Pearson's Chi-squared test
## 
## data:  bank$housing and bank$y
## X-squared = 0.62738, df = 2, p-value = 0.7307

Analysis and recommendation: The variable “Housing” is not significant (basing our decision on Ho: m1 = m2 = m3 = m4 …, Ha: at least one of the groups has a different mean). We recommend NOT TO USE it in the model.

Cross-table for Personal Loan

CrossTable(bank$loan, bank$y)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
##    bank$loan |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |      2975 |       374 |      3349 | 
##              |     0.018 |     0.146 |           | 
##              |     0.888 |     0.112 |     0.813 | 
##              |     0.811 |     0.829 |           | 
##              |     0.722 |     0.091 |           | 
## -------------|-----------|-----------|-----------|
##      unknown |        96 |         9 |       105 | 
##              |     0.067 |     0.542 |           | 
##              |     0.914 |     0.086 |     0.025 | 
##              |     0.026 |     0.020 |           | 
##              |     0.023 |     0.002 |           | 
## -------------|-----------|-----------|-----------|
##          yes |       597 |        68 |       665 | 
##              |     0.039 |     0.318 |           | 
##              |     0.898 |     0.102 |     0.161 | 
##              |     0.163 |     0.151 |           | 
##              |     0.145 |     0.017 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
##              |     0.891 |     0.109 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Chi-square test: Personal Loan

chisq.test(bank$loan, bank$y)
## 
##  Pearson's Chi-squared test
## 
## data:  bank$loan and bank$y
## X-squared = 1.1297, df = 2, p-value = 0.5684

Analysis and recommendation: The variable “Loan” is not significant (basing our decision on Ho: m1 = m2 = m3 = m4 …, Ha: at least one of the groups has a different mean). We recommend NOT TO USE it in the model.

7) Cleaning data: Modifying pdays

table(bank$pdays)
## 
##    0    1    2    3    4    5    6    7    9   10   11   12   13   14   15   16 
##    2    3    4   52   14    4   42   10    3    8    1    5    2    1    2    2 
##   17   18   19   21  999 
##    1    2    1    1 3959

We saw that we had 3959 observations with the value of “999”, that means not ever contacted before, so we decided to recode the variable as a binary variable, where “999” = 0 (not contacted before), and all the other observations would be coded = 1 (contacted).

Just to verify that we did our recoding correctly, and see the final distribution.

bank$pdays_1<-ifelse(bank$pdays==999,0,1)

table(bank$pdays_1)
## 
##    0    1 
## 3959  160

Defining if pdays_1 is significant

CrossTable(bank$pdays_1, bank$y)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$pdays_1 |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |      3608 |       351 |      3959 | 
##              |     1.930 |    15.694 |           | 
##              |     0.911 |     0.089 |     0.961 | 
##              |     0.984 |     0.778 |           | 
##              |     0.876 |     0.085 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        60 |       100 |       160 | 
##              |    47.748 |   388.334 |           | 
##              |     0.375 |     0.625 |     0.039 | 
##              |     0.016 |     0.222 |           | 
##              |     0.015 |     0.024 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
##              |     0.891 |     0.109 |           | 
## -------------|-----------|-----------|-----------|
## 
## 

Chi-square pdays_1

chisq.test(bank$pdays_1, bank$y)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  bank$pdays_1 and bank$y
## X-squared = 448.22, df = 1, p-value < 2.2e-16

Analysis: Here we see a “perfect separation” in pdays similar as “Duration”. The reason is exactly the same as the one with “Duration”.

8) Overview of data

Is the data balanced or not

plot(bank$y)

Analysis: There are more number of “no” compared to “yes”, so the data clearly is imbalanced

Overview of Data (numeric variables)

Scatter plot (pairs)

pairs1 = bank[, c("y","age","duration","campaign","previous","previous",
                 "emp.var.rate","cons.price.idx","cons.conf.idx")]

pairs(pairs1)

Overview of Data (Factor variables)

Scatter plot (pairs)

pairs2 = bank[, c("y","job","marital","education","default","housing","loan",
                 "contact","month","day_of_week","poutcome","pdays_1")]

pairs(pairs2)

Analysis: None of the variables seem to be correlated from the above plots.

Data Cleaning:

Remove “Unknown” Categories

First step is to create a copy dataset, to start removing those rows.

bank_clean <- bank

We will remove the “Unknown” Category from the following Factor Variables: Job, Education, Marital.

bank_clean <- subset(bank_clean, bank_clean$job != "unknown")

From 4119 observations 4080 (39 for Job)

bank_clean <- subset(bank_clean, bank_clean$education != "unknown")

From 4080 observations to 3926 (167 for Education)

bank_clean <- subset(bank_clean, bank_clean$marital != "unknown")

From 3926 observations to 3915 (11 for Marital)

Remove variables that are not important

bank_clean = subset(bank_clean, select = -c(age, duration, default, day_of_week, housing, loan, pdays))

Analyze dataset: Is dataset balanced?

table(bank$y)
## 
##   no  yes 
## 3668  451
table(bank_clean$y)
## 
##   no  yes 
## 3493  422

Analysis: We can see from the table that the dataset is not balanced, it is more skewed towards “no”

9) Logistic Regression Model with all variables

Model_1 = glm(data = bank, formula = y ~ .-pdays, family = binomial)

summary(Model_1)
## 
## Call:
## glm(formula = y ~ . - pdays, family = binomial, data = bank)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.9381  -0.2840  -0.1750  -0.1135   2.8050  
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.452e+02  1.228e+02  -1.182 0.237231    
## age                           8.665e-03  8.137e-03   1.065 0.286955    
## jobblue-collar               -2.270e-01  2.706e-01  -0.839 0.401439    
## jobentrepreneur              -7.740e-01  4.988e-01  -1.552 0.120700    
## jobhousemaid                  2.476e-01  4.462e-01   0.555 0.578923    
## jobmanagement                -3.135e-01  2.854e-01  -1.099 0.271921    
## jobretired                   -2.065e-01  3.464e-01  -0.596 0.551034    
## jobself-employed             -7.305e-01  4.163e-01  -1.755 0.079318 .  
## jobservices                   1.248e-01  2.821e-01   0.442 0.658172    
## jobstudent                   -7.303e-02  3.981e-01  -0.183 0.854460    
## jobtechnician                 2.209e-01  2.243e-01   0.985 0.324641    
## jobunemployed                 3.363e-01  3.912e-01   0.860 0.389974    
## jobunknown                   -4.622e-01  7.254e-01  -0.637 0.523949    
## maritalmarried                2.781e-01  2.457e-01   1.132 0.257592    
## maritalsingle                 3.275e-01  2.797e-01   1.171 0.241652    
## maritalunknown                2.767e-01  1.145e+00   0.242 0.808961    
## educationbasic.6y             3.220e-01  4.077e-01   0.790 0.429662    
## educationbasic.9y             2.012e-01  3.214e-01   0.626 0.531376    
## educationhigh.school          1.516e-01  3.066e-01   0.494 0.620998    
## educationilliterate          -1.144e+01  5.354e+02  -0.021 0.982946    
## educationprofessional.course  1.168e-01  3.355e-01   0.348 0.727827    
## educationuniversity.degree    3.187e-01  3.088e-01   1.032 0.301945    
## educationunknown              2.424e-01  3.879e-01   0.625 0.532044    
## defaultunknown                1.452e-01  2.099e-01   0.692 0.488982    
## defaultyes                   -8.819e+00  5.354e+02  -0.016 0.986858    
## housingunknown               -5.595e-01  5.219e-01  -1.072 0.283765    
## housingyes                   -6.737e-02  1.376e-01  -0.490 0.624360    
## loanunknown                          NA         NA      NA       NA    
## loanyes                      -1.145e-01  1.870e-01  -0.613 0.540196    
## contacttelephone             -9.556e-01  2.784e-01  -3.432 0.000599 ***
## monthaug                      4.943e-01  4.140e-01   1.194 0.232468    
## monthdec                      8.874e-01  6.760e-01   1.313 0.189255    
## monthjul                      9.378e-02  3.617e-01   0.259 0.795414    
## monthjun                      5.521e-01  4.343e-01   1.271 0.203680    
## monthmar                      2.474e+00  5.182e-01   4.775  1.8e-06 ***
## monthmay                     -3.229e-01  3.007e-01  -1.074 0.282919    
## monthnov                     -3.241e-01  4.188e-01  -0.774 0.438975    
## monthoct                      2.819e-01  5.276e-01   0.534 0.593219    
## monthsep                      1.550e-01  5.965e-01   0.260 0.795025    
## day_of_weekmon                1.426e-01  2.130e-01   0.669 0.503244    
## day_of_weekthu                1.023e-01  2.153e-01   0.475 0.634645    
## day_of_weektue                4.485e-02  2.189e-01   0.205 0.837645    
## day_of_weekwed                3.127e-01  2.222e-01   1.407 0.159371    
## duration                      5.260e-03  2.608e-04  20.167  < 2e-16 ***
## campaign                     -9.987e-02  4.591e-02  -2.175 0.029610 *  
## previous                      1.229e-01  1.714e-01   0.717 0.473384    
## poutcomenonexistent           5.801e-01  2.950e-01   1.966 0.049255 *  
## poutcomesuccess               1.343e+00  6.327e-01   2.123 0.033741 *  
## emp.var.rate                 -8.651e-01  4.651e-01  -1.860 0.062889 .  
## cons.price.idx                1.408e+00  8.109e-01   1.737 0.082414 .  
## cons.conf.idx                 6.440e-02  2.633e-02   2.446 0.014452 *  
## euribor3m                    -1.610e-01  4.172e-01  -0.386 0.699657    
## nr.employed                   2.153e-03  9.924e-03   0.217 0.828263    
## pdays_1                       4.752e-01  6.382e-01   0.745 0.456520    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2845.8  on 4118  degrees of freedom
## Residual deviance: 1597.7  on 4066  degrees of freedom
## AIC: 1703.7
## 
## Number of Fisher Scoring iterations: 12

Analysis: “Duration” attribute highly affects the output target (e.g., if duration=0 then y=‘no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. This leads to “perfect separation problem” thus, this input should be discarded if the intention is to have a realistic predictive model.

9) Creating training and testing samples

set.seed(1)
row.number = sample(1:nrow(bank_clean), 0.8*nrow(bank_clean))
train_bank = bank_clean[row.number,]
test_bank = bank_clean[-row.number,]

10) Model with Dataset not including “Unknown” observations and variables

Model_2 = glm(formula = y ~ ., data = train_bank, family = binomial)

summary(Model_2)
## 
## Call:
## glm(formula = y ~ ., family = binomial, data = train_bank)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8982  -0.3985  -0.3319  -0.2632   2.9531  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -2.392e+02  1.191e+02  -2.008  0.04462 *  
## jobblue-collar               -1.755e-01  2.587e-01  -0.679  0.49743    
## jobentrepreneur              -3.715e-01  4.598e-01  -0.808  0.41910    
## jobhousemaid                 -5.622e-02  4.451e-01  -0.126  0.89947    
## jobmanagement                -4.907e-01  2.893e-01  -1.696  0.08987 .  
## jobretired                    5.637e-02  3.085e-01   0.183  0.85503    
## jobself-employed             -4.570e-01  3.896e-01  -1.173  0.24074    
## jobservices                  -2.736e-01  2.792e-01  -0.980  0.32721    
## jobstudent                    2.102e-01  4.137e-01   0.508  0.61127    
## jobtechnician                 5.928e-02  2.223e-01   0.267  0.78977    
## jobunemployed                 3.578e-01  3.601e-01   0.994  0.32045    
## maritalmarried               -7.156e-03  2.194e-01  -0.033  0.97399    
## maritalsingle                -2.181e-02  2.407e-01  -0.091  0.92779    
## educationbasic.6y             8.546e-02  3.743e-01   0.228  0.81937    
## educationbasic.9y             1.033e-01  2.955e-01   0.349  0.72679    
## educationhigh.school         -1.344e-02  2.839e-01  -0.047  0.96226    
## educationilliterate          -1.141e+01  3.247e+02  -0.035  0.97197    
## educationprofessional.course  2.484e-02  3.105e-01   0.080  0.93623    
## educationuniversity.degree    1.651e-01  2.859e-01   0.577  0.56373    
## contacttelephone             -1.117e+00  2.799e-01  -3.992 6.56e-05 ***
## monthaug                     -5.640e-05  4.178e-01   0.000  0.99989    
## monthdec                      7.167e-01  6.885e-01   1.041  0.29785    
## monthjul                     -1.310e-01  3.468e-01  -0.378  0.70561    
## monthjun                     -1.937e-01  4.202e-01  -0.461  0.64485    
## monthmar                      2.101e+00  5.125e-01   4.099 4.15e-05 ***
## monthmay                     -2.602e-01  2.945e-01  -0.883  0.37702    
## monthnov                     -4.898e-01  4.007e-01  -1.222  0.22160    
## monthoct                      2.268e-01  5.071e-01   0.447  0.65466    
## monthsep                     -3.731e-01  6.149e-01  -0.607  0.54400    
## campaign                     -5.033e-02  3.638e-02  -1.384  0.16647    
## previous                      1.760e-01  1.842e-01   0.955  0.33935    
## poutcomenonexistent           5.893e-01  3.079e-01   1.914  0.05566 .  
## poutcomesuccess               1.310e+00  6.638e-01   1.974  0.04839 *  
## emp.var.rate                 -1.236e+00  4.539e-01  -2.723  0.00647 ** 
## cons.price.idx                2.004e+00  7.883e-01   2.542  0.01102 *  
## cons.conf.idx                 6.686e-02  2.639e-02   2.534  0.01128 *  
## euribor3m                    -5.715e-02  4.180e-01  -0.137  0.89123    
## nr.employed                   1.013e-02  9.638e-03   1.051  0.29336    
## pdays_1                       6.096e-01  6.623e-01   0.920  0.35734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2147.4  on 3131  degrees of freedom
## Residual deviance: 1689.9  on 3093  degrees of freedom
## AIC: 1767.9
## 
## Number of Fisher Scoring iterations: 11

11) Checking collinearity

vif(Model_2)
##                      GVIF Df GVIF^(1/(2*Df))
## job              4.265087 10        1.075218
## marital          1.211676  2        1.049172
## education        3.146399  6        1.100233
## contact          3.110746  1        1.763731
## month           83.753461  9        1.278891
## campaign         1.056256  1        1.027743
## previous         4.319643  1        2.078375
## poutcome        20.444817  2        2.126404
## emp.var.rate   145.429495  1       12.059415
## cons.price.idx  64.240868  1        8.015040
## cons.conf.idx    5.367253  1        2.316733
## euribor3m      142.371477  1       11.931952
## nr.employed    168.653073  1       12.986650
## pdays_1          8.767318  1        2.960966

Analysis: We will run a stepwise removal of the variables, by doing the AIC

Stepwise selection of the variables

glm.null.train_bank = glm(y ~ 1, data = train_bank, family = "binomial")
glm.full.train_bank = glm(y ~ ., 
                      data = train_bank, family = "binomial")
step.AIC1 = step(glm.null.train_bank, scope = list(upper=glm.full.train_bank),
     direction ="both",test ="Chisq", trace = F)


step.AIC1
## 
## Call:  glm(formula = y ~ nr.employed + month + poutcome + contact + 
##     cons.conf.idx + campaign, family = "binomial", data = train_bank)
## 
## Coefficients:
##         (Intercept)          nr.employed             monthaug  
##           47.238245            -0.009203            -0.438389  
##            monthdec             monthjul             monthjun  
##            0.386576             0.038092             0.670823  
##            monthmar             monthmay             monthnov  
##            1.589938            -0.541502            -0.518121  
##            monthoct             monthsep  poutcomenonexistent  
##           -0.029950            -1.021362             0.292429  
##     poutcomesuccess     contacttelephone        cons.conf.idx  
##            1.913730            -0.817545             0.045876  
##            campaign  
##           -0.052587  
## 
## Degrees of Freedom: 3131 Total (i.e. Null);  3116 Residual
## Null Deviance:       2147 
## Residual Deviance: 1710  AIC: 1742
summary(step.AIC1)
## 
## Call:
## glm(formula = y ~ nr.employed + month + poutcome + contact + 
##     cons.conf.idx + campaign, family = "binomial", data = train_bank)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0646  -0.3861  -0.3460  -0.2592   2.7320  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         47.238245   5.338645   8.848  < 2e-16 ***
## nr.employed         -0.009203   0.001062  -8.667  < 2e-16 ***
## monthaug            -0.438389   0.341300  -1.284 0.198979    
## monthdec             0.386576   0.641209   0.603 0.546584    
## monthjul             0.038092   0.308172   0.124 0.901627    
## monthjun             0.670823   0.304145   2.206 0.027412 *  
## monthmar             1.589938   0.423864   3.751 0.000176 ***
## monthmay            -0.541502   0.266508  -2.032 0.042170 *  
## monthnov            -0.518121   0.312337  -1.659 0.097146 .  
## monthoct            -0.029950   0.418612  -0.072 0.942964    
## monthsep            -1.021362   0.473896  -2.155 0.031143 *  
## poutcomenonexistent  0.292429   0.197733   1.479 0.139166    
## poutcomesuccess      1.913730   0.289441   6.612 3.80e-11 ***
## contacttelephone    -0.817545   0.205408  -3.980 6.89e-05 ***
## cons.conf.idx        0.045876   0.015380   2.983 0.002855 ** 
## campaign            -0.052587   0.036262  -1.450 0.147005    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2147.4  on 3131  degrees of freedom
## Residual deviance: 1709.9  on 3116  degrees of freedom
## AIC: 1741.9
## 
## Number of Fisher Scoring iterations: 6

13) Goodness of Fit: Hosmer-Lemeshow test

hoslem.test(step.AIC1$y, fitted(step.AIC1), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  step.AIC1$y, fitted(step.AIC1)
## X-squared = 4.2476, df = 8, p-value = 0.8341

Analysis: Since the Hosmer-Lemeshow test yielded a large p-value of 0.8341, we do not reject our null hypothesis. This means that the model is adequate

13) PredProbs

test_bank$PredProb = predict.glm(step.AIC1, newdata = test_bank, type = "response")
test_bank$PredSub = ifelse(test_bank$PredProb >= 0.5, "yes", "no")


table(test_bank$PredSub)
## 
##  no yes 
## 751  32
caret::confusionMatrix(test_bank$y,as.factor(test_bank$PredSub))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  688  12
##        yes  63  20
##                                           
##                Accuracy : 0.9042          
##                  95% CI : (0.8814, 0.9239)
##     No Information Rate : 0.9591          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3069          
##                                           
##  Mcnemar's Test P-Value : 7.764e-09       
##                                           
##             Sensitivity : 0.9161          
##             Specificity : 0.6250          
##          Pos Pred Value : 0.9829          
##          Neg Pred Value : 0.2410          
##              Prevalence : 0.9591          
##          Detection Rate : 0.8787          
##    Detection Prevalence : 0.8940          
##       Balanced Accuracy : 0.7706          
##                                           
##        'Positive' Class : no              
## 
PredProb1 = prediction(predict.glm(step.AIC1, newdata = test_bank, type = "response"), test_bank$y)

# Computing threshold for cutoff to best trade off sensitivity and specificity
plot(unlist(performance(PredProb1,'sens')@x.values),unlist(performance(PredProb1,'sens')@y.values), type='l', lwd=2, ylab = "", xlab = 'Cutoff')
mtext('Sensitivity',side=2)
mtext('Sensitivity vs. Specificity Plot for AIC Model', side=3)

# Second specificity in same plot
par(new=TRUE)
plot(unlist(performance(PredProb1,'spec')@x.values),unlist(performance(PredProb1,'spec')@y.values), type='l', lwd=2,col='red', ylab = "", xlab = 'Cutoff')
axis(4,at=seq(0,1,0.2)) 
mtext('Specificity',side=4, col='red')

par(new=TRUE)

min.diff <-which.min(abs(unlist(performance(PredProb1, "sens")@y.values) - unlist(performance(PredProb1, "spec")@y.values)))
min.x<-unlist(performance(PredProb1, "sens")@x.values)[min.diff]
min.y<-unlist(performance(PredProb1, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

test_bank$PredSubOptimal = ifelse(test_bank$PredProb >= 0.08, "yes", "no")


table(test_bank$PredSubOptimal)
## 
##  no yes 
## 537 246
caret::confusionMatrix(test_bank$y,as.factor(test_bank$PredSubOptimal))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  516 184
##        yes  21  62
##                                           
##                Accuracy : 0.7382          
##                  95% CI : (0.7059, 0.7687)
##     No Information Rate : 0.6858          
##     P-Value [Acc > NIR] : 0.0007765       
##                                           
##                   Kappa : 0.2595          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9609          
##             Specificity : 0.2520          
##          Pos Pred Value : 0.7371          
##          Neg Pred Value : 0.7470          
##              Prevalence : 0.6858          
##          Detection Rate : 0.6590          
##    Detection Prevalence : 0.8940          
##       Balanced Accuracy : 0.6065          
##                                           
##        'Positive' Class : no              
## 

Analysis: Even though the accuracy decreases, the optimal cutoff gives us a higher true subscription values. Hence it is important to have the optimal cutoff instead of the arbitrary (0.5) cutoff.