Import data and packages

df_full <- read.csv("/Users/apple/Desktop/kun_hw/bank-full.csv", sep = ";")
library(magrittr) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.7     ✔ purrr   0.3.4
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
require(ISLR)
## Loading required package: ISLR
require(grid)
## Loading required package: grid
library(ggmosaic)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
knitr::opts_chunk$set(echo = TRUE)

Browse and clean the data

ncol(df_full)
## [1] 17
nrow(df_full)
## [1] 45211
str(df_full)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr  "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "yes" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...
summary(df_full)
##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Upon the first glance of the dataset, we can see that age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous and poutcome are the variable. The binary ‘deposit’ is the response variable.

Here we can see the distribution of the target varible.

barplot(table(df_full$y), col="cyan", ylim=c(0,5000),
         xlab="Deposit",ylab="Frequency")

df_full <- df_full %>%
  mutate(y_binary = ifelse(y == "no",0,1))
  summary(df_full$y_binary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.117   0.000   1.000

Data Validation

sum(is.na(df_full))
## [1] 0
sum(!complete.cases(df_full))
## [1] 0

Here we can see there is no missing data.

See the categorical variables

1. job

table(df_full$job)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          5171          9732          1487          1240          9458 
##       retired self-employed      services       student    technician 
##          2264          1579          4154           938          7597 
##    unemployed       unknown 
##          1303           288

We have 288 unknown jobs.

ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y,job), fill = y))
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## Please use `unite()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

We can see higher response among students and retired people, also, we can ignore “unknown”. No big effect seen here.

2. marital

str(df_full$marital)
##  chr [1:45211] "married" "single" "married" "married" "single" "married" ...
levels(df_full$marital)
## NULL
plot(table(df_full$marital))

ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y,marital), fill = y))

Here we can see no big effect of marriage. Singles slightly more like to say “yes”.

3. education

str(df_full$education)
##  chr [1:45211] "tertiary" "secondary" "secondary" "unknown" "unknown" ...
levels(df_full$education)
## NULL
table(df_full$education)
## 
##   primary secondary  tertiary   unknown 
##      6851     23202     13301      1857
plot(table(df_full$education))

ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y, education), fill = y)) 

It appears that a positive correlation exists between the number of years of education and the odds of subscribing to a term deposit.

4. default

str(df_full$default)
##  chr [1:45211] "no" "no" "no" "no" "no" "no" "no" "yes" "no" "no" "no" "no" ...
levels(df_full$default)
## NULL
table(df_full$default)
## 
##    no   yes 
## 44396   815

It seems like this variable is not useble in the final model.

5. housing

str(df_full$housing)
##  chr [1:45211] "yes" "yes" "yes" "yes" "no" "yes" "yes" "yes" "yes" "yes" ...
levels(df_full$housing)
## NULL
table(df_full$housing)
## 
##    no   yes 
## 20081 25130
plot(table(df_full$housing))

ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y, housing), fill = y))

Not much difference between those who have housing loans and those who do not.

6.loan

table(df_full$loan)
## 
##    no   yes 
## 37967  7244
  ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y, loan), fill = y))

Those do not have personal loans are more likely to say “yes”.

7.contact

table(df_full$contact)
## 
##  cellular telephone   unknown 
##     29285      2906     13020
ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y, contact), fill = y))

There are 13020 unknown contacts. We can see cellular responders are more likely to subscribe to a term deposit.

8. month

  ggplot(data = df_full) +
  aes(x = month, y = ..count../nrow(df_full), fill = y) +
  geom_bar() 

month_table <- table(df_full$month, df_full$y)
month_tab <- as.data.frame(prop.table(month_table, 2))
colnames(month_tab) <-  c("month", "y", "percent")

ggplot(data = month_tab, aes(x = month, y = percent, fill = y)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 2/3) 

We can see month may be a very important varible.

9. poutcome

table(df_full$poutcome)
## 
## failure   other success unknown 
##    4901    1840    1511   36959
  ggplot(data = df_full) +
  geom_mosaic(aes(x = product(poutcome), fill = y))

Here we can see a lot of unknown poutcomes. So we will not use this varible for modeling.

Continuous variables

1. age

summary(df_full$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   39.00   40.94   48.00   95.00
ggplot(data = df_full) +
  aes(x = age) +
  geom_bar() +
  geom_vline(xintercept = c(30, 60), 
             col = "red",
             linetype = "dashed") +
  facet_grid(y ~ .,
             scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 100, 5))

df_full = df_full %>% 
  mutate(age_discrete = if_else(age > 60, "high", if_else(age > 30, "mid", "low")))

Here we can see banks are not very much interested by contacting the older population. Even though, elderly persons are more likely to subscribe to a term deposit.

2. day

  ggplot(data = df_full) +
  aes(x = day, y = ..count../nrow(df_full), fill = y) +
  geom_bar()

4. pdays

df_full <- df_full %>% 
  mutate(pdays_dummy = if_else(pdays == 999, "0", "1"))
  ggplot(data = df_full) +
  geom_mosaic(aes(x = product(y, pdays_dummy), fill = y))

Final Data Preparition

Duplicating data for safe keeping

df_dup <- df_full

Removing and transforming “unknowns”

df_dup <- df_dup %>% 
  filter(job != "unknown")

df_dup <- df_dup %>% 
  filter(marital != "unknown")

df_dup = df_dup %>% 
  mutate(education = recode(education, "unknown" = "university.degree"))

Converting variables to factors with ordered levels

df_dup$contact <- factor(df_dup$contact, order = TRUE, levels =c('telephone', 'cellular'))
df_dup$education <- factor(df_dup$education, order = TRUE, levels =c('illiterate','basic.4y', 'basic.6y','basic.9y', 'high.school','professional.course','university.degree'))
df_dup$age_discrete <- factor(df_dup$age_discrete, order = TRUE, levels =c('low', 'mid','high'))
df_dup$job <- factor(df_dup$job, order = TRUE, levels =c('blue-collar', 'services','entrepreneur', 'housemaid', 'self-employed','technician', 'management','admin.','unemployed', 'retired','student'))
df_dup$marital <- factor(df_dup$marital, order = TRUE, levels =c('married', 'divorced', 'single'))
df_dup$month <- factor(df_dup$month, order = TRUE, levels =c('mar', 'apr','may', 'jun','jul', 'aug', 'sep','oct', 'nov','dec'))
df_dup$previous_binned <- factor(df_dup$previous_binned, order = TRUE, levels =c('0', '1','2+'))

Set training and tesing datasets

set.seed(1984)
training <- createDataPartition(df_dup$y_binary, p = 0.8, list=FALSE)

train_data <- df_dup[training,]
test_data <- df_dup[-training,]

Logistic Regression

model <- glm(y_binary ~ age_discrete + marital + month + job + contact + previous_binned + pdays, family=binomial(link='logit'),data=train_data)

summary(model)
## 
## Call:
## glm(formula = y_binary ~ age_discrete + marital + month + job + 
##     contact + previous_binned + pdays, family = binomial(link = "logit"), 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8622  -0.5175  -0.4354  -0.3662   2.6126  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.3606942  0.0703407  -5.128 2.93e-07 ***
## age_discrete.L     0.6215273  0.0906480   6.856 7.06e-12 ***
## age_discrete.Q     0.6034711  0.0530182  11.382  < 2e-16 ***
## marital.L          0.1903646  0.0354693   5.367 8.00e-08 ***
## marital.Q         -0.0434142  0.0537657  -0.807  0.41940    
## month.L           -0.0480638  0.1084475  -0.443  0.65762    
## month.Q            0.8138362  0.1116577   7.289 3.13e-13 ***
## month.C           -0.6268737  0.1078767  -5.811 6.21e-09 ***
## month^4            0.6404933  0.0849781   7.537 4.80e-14 ***
## month^5            0.3796797  0.0804174   4.721 2.34e-06 ***
## month^6            1.5049113  0.0712363  21.126  < 2e-16 ***
## month^7            0.9007997  0.0697739  12.910  < 2e-16 ***
## month^8           -1.0147380  0.0824023 -12.314  < 2e-16 ***
## month^9            0.2826455  0.0660641   4.278 1.88e-05 ***
## job.L              0.6025173  0.0936266   6.435 1.23e-10 ***
## job.Q              0.0917363  0.0814869   1.126  0.26026    
## job.C              0.0026683  0.0966357   0.028  0.97797    
## job^4             -0.0209705  0.0930948  -0.225  0.82178    
## job^5              0.2016689  0.0938861   2.148  0.03171 *  
## job^6              0.0673022  0.0913305   0.737  0.46118    
## job^7              0.2692993  0.1008115   2.671  0.00756 ** 
## job^8              0.2250140  0.1021595   2.203  0.02762 *  
## job^9              0.0404592  0.0977821   0.414  0.67904    
## job^10             0.2300963  0.0764311   3.011  0.00261 ** 
## contact.L          0.2755660  0.0559369   4.926 8.38e-07 ***
## previous_binned.L  0.8895023  0.0584090  15.229  < 2e-16 ***
## previous_binned.Q -0.3081200  0.0627427  -4.911 9.07e-07 ***
## pdays             -0.0025648  0.0003236  -7.926 2.26e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18713  on 22364  degrees of freedom
## Residual deviance: 16544  on 22337  degrees of freedom
##   (13574 observations deleted due to missingness)
## AIC: 16600
## 
## Number of Fisher Scoring iterations: 5

Let’s look at the prediction of the model on the test set (test_data).

pred.train <- predict(model,test_data)
pred.train <- ifelse(pred.train > 0.5,1,0)
mean(pred.train == test_data$y_binary)
## [1] NA
t1 <- table(pred.train,test_data$y_binary)
presicion <- t1[1,1]/(sum(t1[1,]))
recall <- t1[1,1]/(sum(t1[,1]))
presicion
## [1] 0.8541629
F1<- 2*presicion*recall/(presicion+recall)
F1
## [1] 0.9177857

According the presicion and recall of the model, we can say this model works well.