LOAD THE REQUIRED PACKAGES:

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

LOAD THE DATASET:

bank <- read.csv("~/Downloads/bank-additional-full-official.csv", sep=";")
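
The same file can also be read with readr (attached above as part of the tidyverse); a minimal sketch of the equivalent call, left unrun here so the base-R data.frame is kept for the steps below:

# Equivalent readr alternative; read_delim() returns a tibble instead of a data.frame
# bank <- read_delim("~/Downloads/bank-additional-full-official.csv", delim = ";")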

OVERVIEW OF THE DATASET:

head(bank)
##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         93.994         -36.4     4.857        5191 no
## 2         93.994         -36.4     4.857        5191 no
## 3         93.994         -36.4     4.857        5191 no
## 4         93.994         -36.4     4.857        5191 no
## 5         93.994         -36.4     4.857        5191 no
## 6         93.994         -36.4     4.857        5191 no
tail(bank)
##       age         job marital           education default housing loan  contact
## 41183  29  unemployed  single            basic.4y      no     yes   no cellular
## 41184  73     retired married professional.course      no     yes   no cellular
## 41185  46 blue-collar married professional.course      no      no   no cellular
## 41186  56     retired married   university.degree      no     yes   no cellular
## 41187  44  technician married professional.course      no      no   no cellular
## 41188  74     retired married professional.course      no     yes   no cellular
##       month day_of_week duration campaign pdays previous    poutcome
## 41183   nov         fri      112        1     9        1     success
## 41184   nov         fri      334        1   999        0 nonexistent
## 41185   nov         fri      383        1   999        0 nonexistent
## 41186   nov         fri      189        2   999        0 nonexistent
## 41187   nov         fri      442        1   999        0 nonexistent
## 41188   nov         fri      239        3   999        1     failure
##       emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed   y
## 41183         -1.1         94.767         -50.8     1.028      4963.6  no
## 41184         -1.1         94.767         -50.8     1.028      4963.6 yes
## 41185         -1.1         94.767         -50.8     1.028      4963.6  no
## 41186         -1.1         94.767         -50.8     1.028      4963.6  no
## 41187         -1.1         94.767         -50.8     1.028      4963.6 yes
## 41188         -1.1         94.767         -50.8     1.028      4963.6  no
names(bank)
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "y"
class(bank)
## [1] "data.frame"
dim(bank)
## [1] 41188    21
nrow(bank)
## [1] 41188
ncol(bank)
## [1] 21

EXPLORE THE DATA STRUCTURE:

str(bank)
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr  "housemaid" "services" "services" "admin." ...
##  $ marital       : chr  "married" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr  "no" "unknown" "no" "no" ...
##  $ housing       : chr  "no" "no" "yes" "no" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : chr  "no" "no" "no" "no" ...

DATA PREPARATION:

# Encode the target column "y" as an integer: it takes the binary values "yes" and "no" (subscribed to a term deposit or not), which we map to 1 and 0.
bank <- bank %>%
  mutate(y=ifelse(y=="no", 0, 1))
bank$y <- as.integer(bank$y)

# Convert the character variables to factors so that the regression models below treat them as categorical predictors.
bank$job <- as.factor(bank$job)
bank$marital <- as.factor(bank$marital)
bank$education <- as.factor(bank$education)
bank$default <- as.factor(bank$default)
bank$loan <- as.factor(bank$loan)
bank$housing <- as.factor(bank$housing)
bank$contact <- as.factor(bank$contact)
bank$month <- as.factor(bank$month)
bank$day_of_week <- as.factor(bank$day_of_week)
bank$poutcome <- as.factor(bank$poutcome)
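
The ten explicit conversions above can be written more compactly with dplyr's across(); a sketch that is equivalent here, since after encoding y the remaining character columns are exactly the ten listed:

# Convert every remaining character column to a factor in one step
bank <- bank %>%
  mutate(across(where(is.character), as.factor))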

CHECKING MISSING VALUE IN THE DATASET:

colSums(is.na(bank))
##            age            job        marital      education        default 
##              0              0              0              0              0 
##        housing           loan        contact          month    day_of_week 
##              0              0              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0
# There are no missing values (NA) in any column of the dataset. However, according to the data documentation, the value "unknown" denotes a missing value, so we count those instead:

sum(bank == "unknown")
## [1] 12718
# There are 12,718 "unknown" values in the dataset. Next we identify which variables have the highest number of missing values:

bank %>% 
  summarise_all(list(~sum(. == "unknown"))) %>% 
  gather(key = "variable", value = "nr_unknown") %>% 
  arrange(-nr_unknown)
##          variable nr_unknown
## 1         default       8597
## 2       education       1731
## 3         housing        990
## 4            loan        990
## 5             job        330
## 6         marital         80
## 7             age          0
## 8         contact          0
## 9           month          0
## 10    day_of_week          0
## 11       duration          0
## 12       campaign          0
## 13          pdays          0
## 14       previous          0
## 15       poutcome          0
## 16   emp.var.rate          0
## 17 cons.price.idx          0
## 18  cons.conf.idx          0
## 19      euribor3m          0
## 20    nr.employed          0
## 21              y          0
# Six variables contain "unknown" (i.e., missing) values.
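
summarise_all() and gather() still work but have been superseded in newer tidyverse releases; a sketch of the same table using the current across() / pivot_longer() idiom:

bank %>%
  summarise(across(everything(), ~ sum(. == "unknown"))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "nr_unknown") %>%
  arrange(desc(nr_unknown))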

CALCULATE CONVERSION RATE:

# Total number of conversions
sum(bank$y)
## [1] 4640
# Total number of clients in the data
nrow(bank)
## [1] 41188
# Conversion rate
sum(bank$y)/nrow(bank)*100
## [1] 11.26542
# The conversion rate of this dataset is approximately 11.27%.
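
Since conversion rates are computed for several demographic variables below, a small reusable helper keeps those grouped summaries consistent; a sketch (the function name conversion_rate_by is ours, not part of any package):

# Hypothetical helper: conversion rate by an arbitrary grouping column
conversion_rate_by <- function(data, group) {
  data %>%
    group_by({{ group }}) %>%
    summarize(TotalCount = n(),
              NumberConversions = sum(y),
              ConversionRate = NumberConversions / TotalCount * 100) %>%
    arrange(desc(ConversionRate))
}
# e.g. conversion_rate_by(bank, job) matches the conversionsJob summary below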

EXPLORATORY DATA ANALYSIS: In this section we analyze the data to identify the demographic target segments of the bank's marketing campaign. The demographic factors considered are age, education, job, and marital status.

# Age: What is the age range of the bank's marketing target segment? What is the average age?

mean(bank$age)
## [1] 40.02406
min(bank$age)
## [1] 17
max(bank$age)
## [1] 98
# Ages range from 17 to 98; the average is about 40 years old.

bank %>% 
  ggplot() +
  aes(x = age) +
  geom_bar() +
  geom_vline(xintercept = c(30, 60), 
             col = "red",
             linetype = "dashed") +
  facet_grid(y ~ .,
             scales = "free_y") +
  scale_x_continuous(breaks = seq(0, 100, 5))

# From the graph, it is clear that the bank rarely contacts the older population beyond 60 years old.

According to the chart, the over-60 age group has the highest conversion rate for term deposits, yet this group has received less attention and fewer contacts from the bank. This may be because the older age group is harder to reach by telemarketing, as they tend to be less familiar with the technology.
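
To quantify this claim, a quick sketch that buckets clients at the 30/60 cut points marked by the dashed lines and compares contact volume with conversion rate:

# Contact volume and conversion rate by age band
bank %>%
  mutate(age_band = cut(age, breaks = c(0, 30, 60, Inf),
                        labels = c("under 30", "30-60", "over 60"))) %>%
  group_by(age_band) %>%
  summarize(TotalCount = n(), ConversionRate = mean(y) * 100)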

# Job: What kinds of jobs are represented in the client pool?
table(bank$job)
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##         10422          9254          1456          1060          2924 
##       retired self-employed      services       student    technician 
##          1720          1421          3969           875          6743 
##    unemployed       unknown 
##          1014           330
mytable <- table(bank$job, bank$y)
tab <- as.data.frame(prop.table(mytable, 2))
colnames(tab) <-  c("job", "y", "perc")

ggplot(data = tab, aes(x = job, y = perc, fill = y)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 2/3) + 
    xlab("Job")+
    ylab("Percent") + theme(axis.text.x=element_text(angle = 90, hjust = 0))

#group the data
conversionsJob <- bank %>%
  group_by(Job=job) %>%
  summarize(TotalCount=n(), NumberConversions=sum(y)) %>%
  mutate(ConversionRate=NumberConversions/TotalCount*100) %>%
  arrange(desc(ConversionRate))

#order the jobs DESC for the bar chart
conversionsJob$Job <- factor(conversionsJob$Job, 
                                   levels = conversionsJob$Job[order(-conversionsJob$ConversionRate)])

# visualizing conversions by job
ggplot(conversionsJob, aes(x=Job, y=ConversionRate)) +
  geom_bar(width=0.5, stat = "identity", fill="darkblue") +
  labs(title="Conversion Rates by Job") +
  theme(axis.text.x = element_text(angle = 90))

Surprisingly, the student, retired, admin. and unemployed categories show the highest relative frequencies of term-deposit subscription. Demand for deposit subscription in the blue-collar category is quite low.

# Marital status: What is the marital-status distribution of the potential clients?
table(bank$marital)
## 
## divorced  married   single  unknown 
##     4612    24928    11568       80
mytable <- table(bank$marital, bank$y)
tab <- as.data.frame(prop.table(mytable, 2))
colnames(tab) <-  c("marital", "y", "perc")

ggplot(data = tab, aes(x = marital, y = perc, fill = y)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 2/3) + 
    xlab("Marital")+
    ylab("Percent")

From the chart, it is clear that single clients subscribe to term deposits more often than the other groups (divorced and married).

# Education: What is the education level of the target clients?

table(bank$education)
## 
##            basic.4y            basic.6y            basic.9y         high.school 
##                4176                2292                6045                9515 
##          illiterate professional.course   university.degree             unknown 
##                  18                5243               12168                1731
mytable <- table(bank$education, bank$y)
tab <- as.data.frame(prop.table(mytable, 2))
colnames(tab) <-  c("education", "y", "perc")

ggplot(data = tab, aes(x = education, y = perc, fill = y)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 2/3) + 
    xlab("Education") +
    ylab("Percent") + theme(axis.text.x=element_text(angle = 90, hjust = 0))

It appears that there is a positive correlation between years of education and the likelihood of subscribing to a term deposit. Clients with a university degree are the group most likely to take up a term deposit. The three groups that bank marketing should focus on are high school, professional course, and university degree. I would also recommend limiting marketing efforts on the "basic.6y", "basic.9y" and unknown groups. As for the illiterate group, the number of clients is too small to draw conclusions, so I would not recommend focusing on this segment.

# Month: Which month records the highest term-deposit subscription rate?

mytable <- table(bank$month, bank$y)
tab <- as.data.frame(prop.table(mytable, 2))
colnames(tab) <-  c("month", "y", "perc")

ggplot(data = tab, aes(x = month, y = perc, fill = y)) + 
  geom_bar(stat = 'identity', position = 'dodge', alpha = 2/3) + 
    xlab("Month")+
    ylab("Percent")

First of all, we can notice that no contacts were made during January and February. The highest contact volume occurs in May, yet May has the worst ratio of subscribers to persons contacted. Every month with a very low contact frequency (March, September, October and December) shows very good results. December aside, there are enough observations to conclude this isn't pure luck, so this feature will probably be very important in the models.
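
A sketch that makes this volume-versus-rate trade-off explicit:

# Contacts and conversion rate per month; the low-volume months convert best
bank %>%
  group_by(month) %>%
  summarize(TotalCount = n(), ConversionRate = mean(y) * 100) %>%
  arrange(desc(ConversionRate))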

BIVARIATE ANALYSIS OF SOCIAL AND ECONOMIC ATTRIBUTES:

# Correlation analysis
library(corrplot)
## corrplot 0.90 loaded
bank %>% 
  select(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed) %>% 
  cor() %>% 
  corrplot(method = "number",
           type = "upper",
           tl.cex = 0.8,
           tl.srt = 45,
           tl.col = "black")

bank %>%
  ggplot() +
  geom_point(aes(x=age, y=cons.price.idx, color=as.factor(y))) + ylab("consumer price index (monthly)") + labs(title="Consumer price index based on age group scatterplot")

The Consumer Price Index (CPI) measures the average change over time in the prices consumers pay for goods and services. As the chart shows, the CPI here is quite stable, fluctuating only within a narrow band (roughly 92-95), which indicates a stable economy with a low inflation rate.

bank %>%
  ggplot() +
  geom_point(aes(x=age, y=cons.conf.idx, color=as.factor(y))) + ylab("consumer confidence index (monthly)") + labs(title="Consumer confidence index based on age group scatterplot")

The Consumer Confidence Index (CCI) measures how optimistic consumers are at a given time. One might expect that the more confident consumers feel about their finances, the more likely they are to take up financial products. However, as seen in the graph, consumers contacted at higher CCI levels do not show a higher rate of taking up the term deposit; in fact, a considerable number of consumers under 60 declined it.

LINEAR REGRESSION MODEL:

# Drop the duration column: it is not known before a call is made, and it trivially determines the outcome (if duration = 0 then y = "no"), so keeping it would leak the target:
bank$duration <- NULL
# pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
# Since 999 is an arbitrary sentinel value rather than a real count, we recode it as NA:
bank$pdays[bank$pdays == 999] <- NA

# Since the vast majority of the data points are NA now, we will discard this column:
bank$pdays <- NULL
# We also drop the education column as it is insignificant:
bank$education <- NULL
# Run the initial linear regression model (a linear probability model):
regmodel <- lm(y ~ ., data = bank)
summary(regmodel)
## 
## Call:
## lm(formula = y ~ ., data = bank)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87337 -0.08107 -0.05212 -0.02770  1.08620 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.569e+01  3.389e+00  -7.581 3.50e-14 ***
## age                  7.477e-05  1.698e-04   0.440 0.659647    
## jobblue-collar      -1.395e-02  4.194e-03  -3.325 0.000885 ***
## jobentrepreneur     -6.458e-03  7.939e-03  -0.813 0.415970    
## jobhousemaid        -1.040e-02  9.146e-03  -1.137 0.255657    
## jobmanagement       -4.439e-03  5.953e-03  -0.746 0.455921    
## jobretired           2.564e-02  8.250e-03   3.108 0.001886 ** 
## jobself-employed    -5.547e-03  7.959e-03  -0.697 0.485858    
## jobservices         -1.241e-02  5.296e-03  -2.343 0.019149 *  
## jobstudent           3.106e-02  1.030e-02   3.017 0.002555 ** 
## jobtechnician       -2.552e-03  4.404e-03  -0.579 0.562286    
## jobunemployed       -5.648e-03  9.267e-03  -0.609 0.542241    
## jobunknown          -1.483e-02  1.581e-02  -0.938 0.348219    
## maritalmarried       3.321e-03  4.553e-03   0.729 0.465856    
## maritalsingle        7.887e-03  5.238e-03   1.506 0.132136    
## maritalunknown       2.744e-02  3.172e-02   0.865 0.386922    
## defaultunknown      -1.281e-02  3.634e-03  -3.525 0.000424 ***
## defaultyes          -4.504e-02  1.621e-01  -0.278 0.781118    
## housingunknown      -5.979e-03  9.171e-03  -0.652 0.514426    
## housingyes          -2.123e-03  2.825e-03  -0.751 0.452466    
## loanunknown                 NA         NA      NA       NA    
## loanyes             -1.300e-03  3.870e-03  -0.336 0.736835    
## contacttelephone    -7.044e-02  5.435e-03 -12.961  < 2e-16 ***
## monthaug             9.708e-02  1.257e-02   7.725 1.14e-14 ***
## monthdec             9.895e-02  2.308e-02   4.287 1.82e-05 ***
## monthjul             2.770e-02  7.730e-03   3.583 0.000340 ***
## monthjun            -7.287e-02  1.246e-02  -5.847 5.05e-09 ***
## monthmar             2.686e-01  1.554e-02  17.287  < 2e-16 ***
## monthmay            -3.916e-02  7.288e-03  -5.374 7.76e-08 ***
## monthnov            -3.790e-02  9.411e-03  -4.027 5.66e-05 ***
## monthoct             8.111e-03  1.455e-02   0.558 0.577103    
## monthsep             2.576e-02  1.778e-02   1.449 0.147346    
## day_of_weekmon      -1.793e-02  4.410e-03  -4.066 4.79e-05 ***
## day_of_weekthu       6.905e-03  4.399e-03   1.570 0.116491    
## day_of_weektue       3.672e-03  4.480e-03   0.820 0.412468    
## day_of_weekwed       1.165e-02  4.465e-03   2.609 0.009087 ** 
## campaign            -1.915e-03  5.100e-04  -3.755 0.000173 ***
## previous             6.003e-03  6.414e-03   0.936 0.349375    
## poutcomenonexistent  5.827e-02  8.929e-03   6.527 6.81e-11 ***
## poutcomesuccess      3.439e-01  9.358e-03  36.749  < 2e-16 ***
## emp.var.rate        -2.044e-01  1.358e-02 -15.048  < 2e-16 ***
## cons.price.idx       2.692e-01  2.260e-02  11.913  < 2e-16 ***
## cons.conf.idx        4.518e-03  7.793e-04   5.797 6.80e-09 ***
## euribor3m            6.894e-02  1.123e-02   6.138 8.43e-10 ***
## nr.employed          1.038e-04  2.704e-04   0.384 0.701245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2806 on 41144 degrees of freedom
## Multiple R-squared:  0.2132, Adjusted R-squared:  0.2124 
## F-statistic: 259.3 on 43 and 41144 DF,  p-value: < 2.2e-16

Interpreting the linear regression model:

The regression model above has an R-squared of 0.2132, which means the model explains 21.32% of the total variability in the dependent variable y. The model clearly explains only a small proportion of the total variation, so it is a poor fit. Moreover, y is binary, so a linear model is not the natural choice for it. Therefore we will fit another model: a logistic regression.
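
A further weakness of the linear probability model is that its fitted values are not constrained to [0, 1]; a quick sketch to check how often they fall outside that range:

# Fitted "probabilities" from lm() can be negative or exceed 1
range(regmodel$fitted.values)
# Share of observations with a fitted value outside [0, 1]
mean(regmodel$fitted.values < 0 | regmodel$fitted.values > 1)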

# Factor the dependent variable:
bank$y <- as.factor(bank$y)

# Check the dataset
glimpse(bank)
## Rows: 41,188
## Columns: 18
## $ age            <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <fct> housemaid, services, services, admin., services, servic…
## $ marital        <fct> married, married, married, married, married, married, m…
## $ default        <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing        <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan           <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact        <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month          <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week    <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ campaign       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ previous       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp.var.rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons.conf.idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Create train dataset and test dataset:
library(caret)
set.seed(101)
inTrain <- createDataPartition(bank$y, times = 1, p = 0.8, list = FALSE)
train <- bank[inTrain, ]
test <- bank[-inTrain, ]
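
createDataPartition() samples within each level of y, so the roughly 11% conversion rate should be preserved in both splits; a quick check:

# Class proportions should be (approximately) identical in train and test
prop.table(table(train$y))
prop.table(table(test$y))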

# Run the logistic regression model:
logistic <- glm(y ~ .,
                data = train,
                family = binomial(link = "logit"))

# Result:
summary(logistic)
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0863  -0.3928  -0.3195  -0.2622   2.9728  
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.231e+02  3.695e+01  -6.038 1.56e-09 ***
## age                  1.282e-03  2.340e-03   0.548 0.583753    
## jobblue-collar      -1.995e-01  6.403e-02  -3.115 0.001838 ** 
## jobentrepreneur     -5.379e-03  1.161e-01  -0.046 0.963051    
## jobhousemaid        -1.411e-01  1.412e-01  -0.999 0.317682    
## jobmanagement       -3.971e-02  8.320e-02  -0.477 0.633151    
## jobretired           2.134e-01  1.030e-01   2.072 0.038279 *  
## jobself-employed    -4.959e-02  1.124e-01  -0.441 0.659103    
## jobservices         -1.596e-01  8.039e-02  -1.985 0.047163 *  
## jobstudent           2.498e-01  1.076e-01   2.321 0.020291 *  
## jobtechnician        7.981e-06  6.149e-02   0.000 0.999896    
## jobunemployed       -1.304e-01  1.257e-01  -1.038 0.299287    
## jobunknown          -1.656e-01  2.284e-01  -0.725 0.468268    
## maritalmarried       6.079e-02  6.758e-02   0.899 0.368413    
## maritalsingle        1.163e-01  7.669e-02   1.516 0.129427    
## maritalunknown       3.952e-01  4.034e-01   0.980 0.327251    
## defaultunknown      -2.839e-01  6.452e-02  -4.401 1.08e-05 ***
## defaultyes          -7.803e+00  8.448e+01  -0.092 0.926400    
## housingunknown      -2.104e-02  1.307e-01  -0.161 0.872105    
## housingyes          -2.102e-02  4.037e-02  -0.521 0.602658    
## loanunknown                 NA         NA      NA       NA    
## loanyes             -1.204e-02  5.554e-02  -0.217 0.828387    
## contacttelephone    -7.077e-01  7.500e-02  -9.437  < 2e-16 ***
## monthaug             4.319e-01  1.196e-01   3.610 0.000307 ***
## monthdec             4.608e-01  2.134e-01   2.160 0.030784 *  
## monthjul             3.608e-02  9.301e-02   0.388 0.698063    
## monthjun            -6.542e-01  1.234e-01  -5.301 1.15e-07 ***
## monthmar             1.455e+00  1.452e-01  10.025  < 2e-16 ***
## monthmay            -4.583e-01  8.091e-02  -5.664 1.48e-08 ***
## monthnov            -4.288e-01  1.171e-01  -3.662 0.000250 ***
## monthoct             3.745e-02  1.499e-01   0.250 0.802736    
## monthsep             2.234e-01  1.760e-01   1.269 0.204377    
## day_of_weekmon      -2.408e-01  6.449e-02  -3.734 0.000188 ***
## day_of_weekthu       3.745e-02  6.201e-02   0.604 0.545873    
## day_of_weektue       3.028e-02  6.390e-02   0.474 0.635570    
## day_of_weekwed       1.143e-01  6.340e-02   1.802 0.071472 .  
## campaign            -5.183e-02  1.059e-02  -4.894 9.87e-07 ***
## previous             7.795e-02  5.893e-02   1.323 0.185932    
## poutcomenonexistent  5.648e-01  9.654e-02   5.851 4.89e-09 ***
## poutcomesuccess      1.724e+00  8.822e-02  19.547  < 2e-16 ***
## emp.var.rate        -1.457e+00  1.369e-01 -10.641  < 2e-16 ***
## cons.price.idx       2.022e+00  2.436e-01   8.298  < 2e-16 ***
## cons.conf.idx        3.052e-02  7.788e-03   3.918 8.91e-05 ***
## euribor3m            2.140e-01  1.281e-01   1.670 0.094937 .  
## nr.employed          6.206e-03  3.013e-03   2.060 0.039385 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 18172  on 32907  degrees of freedom
## AIC: 18260
## 
## Number of Fisher Scoring iterations: 9

First, we can see that variables with p-values smaller than 0.05 are statistically significant predictors, meaning they have an impact on whether people choose to subscribe to the bank term deposit or not. The lower the p-value, the stronger the suggested association with the dependent variable y.

Contact over cellphone (the telephone coefficient is significantly negative), a successful outcome of the previous campaign (poutcomesuccess), and certain months (March in particular) all appear highly predictive of, and positively associated with, a 'yes'.

We also see that most job categories do not seem to have a significant impact. The socio-economic indicators, by contrast, are largely significant: emp.var.rate, cons.price.idx and cons.conf.idx are highly significant, nr.employed is significant at the 5% level, and only euribor3m is marginal.

# Run the logistic regression model only with significant variables:
logistic_2 <- glm(y ~ contact + month + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + nr.employed,
                  data = train,
                  family = binomial(link = "logit"))

# Result:
summary(logistic_2)
## 
## Call:
## glm(formula = y ~ contact + month + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx + nr.employed, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9767  -0.3654  -0.3276  -0.2564   2.7986  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.670e+02  3.029e+01  -8.814  < 2e-16 ***
## contacttelephone    -7.346e-01  7.473e-02  -9.830  < 2e-16 ***
## monthaug             5.607e-01  1.170e-01   4.791 1.66e-06 ***
## monthdec             5.715e-01  2.039e-01   2.803  0.00506 ** 
## monthjul             7.442e-02  9.091e-02   0.819  0.41301    
## monthjun            -7.079e-01  1.217e-01  -5.817 5.99e-09 ***
## monthmar             1.591e+00  1.378e-01  11.545  < 2e-16 ***
## monthmay            -4.412e-01  7.801e-02  -5.656 1.55e-08 ***
## monthnov            -2.454e-01  9.247e-02  -2.654  0.00795 ** 
## monthoct             2.412e-01  1.235e-01   1.953  0.05082 .  
## monthsep             4.262e-01  1.552e-01   2.746  0.00603 ** 
## poutcomenonexistent  4.770e-01  6.167e-02   7.734 1.04e-14 ***
## poutcomesuccess      1.757e+00  8.719e-02  20.149  < 2e-16 ***
## emp.var.rate        -1.499e+00  1.364e-01 -10.985  < 2e-16 ***
## cons.price.idx       2.291e+00  2.179e-01  10.511  < 2e-16 ***
## cons.conf.idx        4.359e-02  5.396e-03   8.078 6.56e-16 ***
## nr.employed          1.006e-02  1.981e-03   5.078 3.82e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 18312  on 32934  degrees of freedom
## AIC: 18346
## 
## Number of Fisher Scoring iterations: 6

We use the anova() function to compare the two models with a likelihood-ratio (chi-squared) test. The null hypothesis is that the reduced second model fits the data as well as the full first model; p < 0.05 rejects this hypothesis in favor of the full model, while p > 0.05 means we fail to reject it.

anova(logistic, logistic_2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: y ~ age + job + marital + default + housing + loan + contact + 
##     month + day_of_week + campaign + previous + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx + euribor3m + nr.employed
## Model 2: y ~ contact + month + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + nr.employed
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1     32907      18172                           
## 2     32934      18312 -27  -139.88 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

With p < 0.05, the test rejects the reduced model: the variables dropped in the second model carry real information, so the full first model fits significantly better.
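
The AIC values reported in the two summaries (18260 for the full model, 18346 for the reduced one) point the same way and can be compared directly:

# Lower AIC favors the full model, agreeing with the deviance test above
AIC(logistic, logistic_2)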

MODEL PREDICTION:

# Predict subscription probabilities on the test set; type = "response" returns
# probabilities rather than log-odds, so the 0.5 cutoff below is applied on the right scale
pred.train <- predict(logistic, test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
pred.train <- ifelse(pred.train > 0.5, 1, 0)

# Overall accuracy: proportion of test-set predictions that are correct
mean(pred.train == test$y)
## [1] 0.8991138
t1 <- table(pred.train,test$y)

# Precision and recall, computed here with "no" (class 0) treated as the
# positive class, since t1[1,1] counts correctly predicted non-subscribers
precision <- t1[1,1]/(sum(t1[1,]))
recall <- t1[1,1]/(sum(t1[,1]))
precision
## [1] 0.9028607
recall
## [1] 0.9931591

The F1 score (also F-score or F-measure) is the harmonic mean of precision and recall, a common measure of a test's accuracy. We calculate the F1 score to summarize the model's performance on this class.

F1 <- 2*precision*recall/(precision+recall)
F1
## [1] 0.9458597
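
Because only about 11% of clients subscribe, accuracy and an F1 score based on the majority class paint an optimistic picture. caret's confusionMatrix() reports per-class metrics directly; a sketch that treats the subscribers ("1") as the positive class:

# Per-class metrics with the minority class ("1" = subscribed) as positive
confusionMatrix(factor(pred.train, levels = c(0, 1)),
                test$y,
                positive = "1")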

GRAPHING THE PREDICTED PROBABILITY:

predicted_data <- data.frame(y_probability = logistic$fitted.values, y=train$y)

predicted_data <- predicted_data[order(predicted_data$y_probability, decreasing=FALSE),]
predicted_data$rank <- 1:nrow(predicted_data)
ggplot(data = predicted_data, aes(x = rank, y = y_probability)) +
  geom_point(aes(color = as.factor(y)), alpha = 1, shape = 4, stroke = 2) +
  xlab("Index") +
  ylab("Predicted probability of subscribing to a term deposit")
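
As a final check of how well the model ranks clients, an ROC curve and AUC can be computed from the test-set probabilities; a sketch assuming the pROC package is installed (it is not loaded anywhere above):

library(pROC)
# Probabilities on the test set, then ranking quality via the ROC curve
prob.test <- predict(logistic, test, type = "response")
roc_obj <- roc(response = test$y, predictor = prob.test)
auc(roc_obj)   # area under the ROC curve
plot(roc_obj)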