library(gmodels)  # Cross Tables [CrossTable()]
library(ggplot2)
library(ggmosaic)  # Mosaic plot with ggplot [geom_mosaic()]
library(corrplot)  # Correlation plot [corrplot()]
library(ggpubr)  # Arranging ggplots together [ggarrange()]
library(cowplot)  # Arranging ggplots together [plot_grid()]
library(caret)  # ML [train(), confusionMatrix(), createDataPartition(), varImp(), trainControl()]
library(ROCR)  # Model performance [performance(), prediction()]
library(plotROC)  # ROC Curve with ggplot [geom_roc()]
library(pROC)  # AUC computation [auc()]
library(PRROC)  # AUPR computation [pr.curve()]
library(vcd)


library(rpart)  # Decision trees [rpart(), plotcp(), prune()]
library(rpart.plot)  # Decision trees plotting [rpart.plot()]
library(ranger)  # Optimized Random Forest [ranger()]
library(lightgbm)  # Light GBM [lgb.train()]
library(xgboost)  # XGBoost [xgb.DMatrix(), xgb.train()]
library(MLmetrics)  # Custom metrics (F1 score for example)
library(tidyverse)  # Data manipulation
# library(doMC) # Parallel processing

Load Data

bank = read.csv(file = "./bank-additional-full.csv", sep = ";", stringsAsFactors = F)

head(bank)

rows <- nrow(bank)

dim(bank)

## [1] 41188    21

names(bank)

##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "y"

This is an unbalanced two-levels categorical variable, 88.7% of values taken are “no” (or “0”) and only 11.3% of the values are “yes” (or “1”). It is more natural to work with a 0/1 dependent variable

CrossTable(bank$y)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  41188 
## 
##  
##           |        no |       yes | 
##           |-----------|-----------|
##           |     36548 |      4640 | 
##           |     0.887 |     0.113 | 
##           |-----------|-----------|
## 
## 
## 
##

bank = bank %>%
    mutate(y = factor(if_else(y == "yes", "1", "0"), levels = c("0", "1")))

There are no missing values
However, according to the data documentation, “unknown” value means NA.

sum(is.na(bank))

## [1] 0

There are 12,718 unknown values in the dataset, let’s try to find out which variables suffer the most from those “missing values”.
6 features have at least 1 unknown value. Before deciding how to manage those missing values, we’ll study each variable and take a decision after visualisations. We can’t afford to delete 8,597 rows in our dataset, it’s more than 20% of our observations.

sum(bank == "unknown")

## [1] 12718

# summarise_all is a function from the dplyr package that applies a function to
# all columns in the data frame. In this case, the function is list(~sum(. ==
# 'unknown')), which uses the ~ notation to define an anonymous function that
# counts the number of times the value 'unknown' appears in each column. The
# output is a new data frame with a single row that summarizes the counts for
# each column.  gather is a function from the tidyr package that reshapes data
# from wide format to long format. The key parameter specifies the name of the
# new column that will contain the names of the original columns, and the value
# parameter specifies the name of the new column that will contain the counts.
# The resulting data frame has three columns: 'variable', 'nr_unknown', and the
# names of the original columns.  arrange is a function from the dplyr package
# that sorts rows based on one or more columns. In this case, -Count specifies
# that the rows should be sorted in descending order based on the values in the
# 'Count' column.

bank %>%
    summarise_all(list(~sum(. == "unknown"))) %>%
    gather(key = "Variable Name", value = "Unknown_Count") %>%
    arrange(-Unknown_Count)

Helper Functions

crosstable_f = function(df, x1, x2){
  # df: dataframe containing both columns to cross
  # var1, var2: columns to cross together.
  CrossTable(df[, x1], df[, x2],
             prop.r = T, # include raw percentages  
             prop.c = F,
             prop.t = F,
             prop.chisq = F,
             dnn = c(x1, x2)) # set names of columns
}


mosaic_theme <- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
                      axis.text.y = element_blank(),
                      axis.ticks.y = element_blank())

fun_mosaic_plot <- function(data, x_var, y_var, x_label, y_label){
  
  data %>% 
    ggplot() +
    geom_mosaic(aes_string(x = paste0("product(", y_var, ",", x_var, ")"), fill = y_var)) +
    mosaic_theme +
    xlab(x_label) +
    ylab(y_label)
  
}

EDA

Univariate

Age

summary(bank$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.00   32.00   38.00   40.02   47.00   98.00

after the 60-years threshold, the relative frequency is higher when y = 1. In other words, we can say that elderly persons are more likely to subscribe to a term deposit

bank %>% 
  ggplot() + # begin plot construction
  aes(x = age) + # specify x axis = age
  geom_bar() + # plot a bar plot faceted on y
  facet_grid(y ~ .,
             scales = "free_y") + # scales vary across y, y-axis scales of each panel will be independent of each other
  scale_x_continuous(breaks = seq(0, 100, 5))

Elderly persons represent 8.92% of observations who accepted to subscribe to a term deposit, this proportion decreases to 1.36% for non subscribers.

# first creates the age_60 variable Then, count() is used to compute the
# frequency of each combination of age_60 and y.  Next, the data is grouped by
# y and nr_y is computed as the sum of n within each y group.

# Finally, relative_freq is computed as a percentage of n within each elder60-y
# group, and the results are ungrouped and selected for output.

bank %>%
    mutate(age_60 = if_else(age > 60, "1", "0")) %>%
    count(age_60, y) %>%
    group_by(y) %>%
    mutate(nr_y = sum(n)) %>%
    mutate(relative_freq = round(100 * n/nr_y, 2)) %>%
    ungroup() %>%
    select(age_60, y, n, relative_freq)

We can also slice the age feature at 30 years to make three easily interpretable classes : [0, 30[, [30, 60[ and [60, +Inf[. The minimum and maximum values are 17 and 98 but we can expect new observations outside this range. We’re replacing the continious variable “age” by this categorical variable.
We might lose some information from this continious-to-discrete transformation, but there wasn’t any clear pattern between years. Cutting into classes make the algorithms easier to interpret later.

bank = bank %>%
    mutate(age = if_else(age > 60, "high", if_else(age > 30, "mid", "low")))

45.5% of people over 60 years old subscribed a term deposit, which is a lot in comparison with younger individuals (15.2% for young adults (aged lower than 30) and only 9.4% for the remaining observations (aged between 30 and 60)).

crosstable_f(bank, "age", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  41188 
## 
##  
##              | y 
##          age |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##         high |       496 |       414 |       910 | 
##              |     0.545 |     0.455 |     0.022 | 
## -------------|-----------|-----------|-----------|
##          low |      6259 |      1124 |      7383 | 
##              |     0.848 |     0.152 |     0.179 | 
## -------------|-----------|-----------|-----------|
##          mid |     29793 |      3102 |     32895 | 
##              |     0.906 |     0.094 |     0.799 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36548 |      4640 |     41188 | 
## -------------|-----------|-----------|-----------|
## 
##

Job

# types of jobs

table(bank$job)

## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##         10422          9254          1456          1060          2924 
##       retired self-employed      services       student    technician 
##          1720          1421          3969           875          6743 
##    unemployed       unknown 
##          1014           330

Majority are admin and blur-collar jobs. Rest have a similar count and 330 unknowns.
We can drop rows that have unknown in job as the counts are very few and probably wont add much information to our model

crosstable_f(bank, "job", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  41188 
## 
##  
##               | y 
##           job |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##        admin. |      9070 |      1352 |     10422 | 
##               |     0.870 |     0.130 |     0.253 | 
## --------------|-----------|-----------|-----------|
##   blue-collar |      8616 |       638 |      9254 | 
##               |     0.931 |     0.069 |     0.225 | 
## --------------|-----------|-----------|-----------|
##  entrepreneur |      1332 |       124 |      1456 | 
##               |     0.915 |     0.085 |     0.035 | 
## --------------|-----------|-----------|-----------|
##     housemaid |       954 |       106 |      1060 | 
##               |     0.900 |     0.100 |     0.026 | 
## --------------|-----------|-----------|-----------|
##    management |      2596 |       328 |      2924 | 
##               |     0.888 |     0.112 |     0.071 | 
## --------------|-----------|-----------|-----------|
##       retired |      1286 |       434 |      1720 | 
##               |     0.748 |     0.252 |     0.042 | 
## --------------|-----------|-----------|-----------|
## self-employed |      1272 |       149 |      1421 | 
##               |     0.895 |     0.105 |     0.035 | 
## --------------|-----------|-----------|-----------|
##      services |      3646 |       323 |      3969 | 
##               |     0.919 |     0.081 |     0.096 | 
## --------------|-----------|-----------|-----------|
##       student |       600 |       275 |       875 | 
##               |     0.686 |     0.314 |     0.021 | 
## --------------|-----------|-----------|-----------|
##    technician |      6013 |       730 |      6743 | 
##               |     0.892 |     0.108 |     0.164 | 
## --------------|-----------|-----------|-----------|
##    unemployed |       870 |       144 |      1014 | 
##               |     0.858 |     0.142 |     0.025 | 
## --------------|-----------|-----------|-----------|
##       unknown |       293 |        37 |       330 | 
##               |     0.888 |     0.112 |     0.008 | 
## --------------|-----------|-----------|-----------|
##  Column Total |     36548 |      4640 |     41188 | 
## --------------|-----------|-----------|-----------|
## 
##

bank = bank %>%
    filter(job != "unknown")


head(bank)

Students (31.4%), retired people (25.2%) and unemployed (14.2%) categories show the best relative frequencies of term deposit subscription. Other classes range between 6.9% (blue-collar) and 13.0% (admin.).

fun_mosaic_plot(bank, "job", "y", "Job", "Proportion")

Marital Status

Removing unknown as it has low count and will add negligiable predictive power

crosstable_f(bank, "marital", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40858 
## 
##  
##              | y 
##      marital |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##     divorced |      4126 |       473 |      4599 | 
##              |     0.897 |     0.103 |     0.113 | 
## -------------|-----------|-----------|-----------|
##      married |     22178 |      2516 |     24694 | 
##              |     0.898 |     0.102 |     0.604 | 
## -------------|-----------|-----------|-----------|
##       single |      9889 |      1605 |     11494 | 
##              |     0.860 |     0.140 |     0.281 | 
## -------------|-----------|-----------|-----------|
##      unknown |        62 |         9 |        71 | 
##              |     0.873 |     0.127 |     0.002 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36255 |      4603 |     40858 | 
## -------------|-----------|-----------|-----------|
## 
##

bank = bank %>%
    filter(marital != "unknown")

fun_mosaic_plot(bank, "marital", "y", "Job", "Proportion")

Education

The illiterate category has not enough observations to be statisticaly meaningful. We can’t discriminate illiterate people by using a pool made of 18 individuals only. Hence, those rows will be deleted from the dataset.

crosstable_f(bank, "education", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40787 
## 
##  
##                     | y 
##           education |         0 |         1 | Row Total | 
## --------------------|-----------|-----------|-----------|
##            basic.4y |      3695 |       423 |      4118 | 
##                     |     0.897 |     0.103 |     0.101 | 
## --------------------|-----------|-----------|-----------|
##            basic.6y |      2077 |       187 |      2264 | 
##                     |     0.917 |     0.083 |     0.056 | 
## --------------------|-----------|-----------|-----------|
##            basic.9y |      5536 |       470 |      6006 | 
##                     |     0.922 |     0.078 |     0.147 | 
## --------------------|-----------|-----------|-----------|
##         high.school |      8436 |      1028 |      9464 | 
##                     |     0.891 |     0.109 |     0.232 | 
## --------------------|-----------|-----------|-----------|
##          illiterate |        14 |         4 |        18 | 
##                     |     0.778 |     0.222 |     0.000 | 
## --------------------|-----------|-----------|-----------|
## professional.course |      4631 |       594 |      5225 | 
##                     |     0.886 |     0.114 |     0.128 | 
## --------------------|-----------|-----------|-----------|
##   university.degree |     10442 |      1654 |     12096 | 
##                     |     0.863 |     0.137 |     0.297 | 
## --------------------|-----------|-----------|-----------|
##             unknown |      1362 |       234 |      1596 | 
##                     |     0.853 |     0.147 |     0.039 | 
## --------------------|-----------|-----------|-----------|
##        Column Total |     36193 |      4594 |     40787 | 
## --------------------|-----------|-----------|-----------|
## 
##

bank = bank %>%
    filter(education != "illiterate")

Among the 1,596 rows containing the “unknown” value, 234 of them subscribed to a term deposit. This is around 5% of the total group of subscribers. Since we’re facing a very unbalanced dependent variable situation, we can not afford to discard those rows. Because this category has the highest relative frequency of “y = 1” (14.7%), we’re going to add them in the “university.degree” level. It has the second highest “y = 1” relative frequency (13.7%).
It appears that a positive correlation between the number of years of education and the odds to subscribe to a term deposit exists.

bank = bank %>%
    mutate(education = recode(education, unknown = "university.degree"))


fun_mosaic_plot(bank, "education", "y", "Job", "Proportion")

bank %>%
    ggplot() + aes(x = education, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")

Default

This feature is certainly not usable. Only 3 individuals replied “yes” to the question “Do you have credit in default?”. People either answered “no” (79.3%) or don’t even reply (20.7%), which gives zero information in our case. This variable is removed from the dataset.

crosstable_f(bank, "default", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40769 
## 
##  
##              | y 
##      default |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |     28182 |      4155 |     32337 | 
##              |     0.872 |     0.128 |     0.793 | 
## -------------|-----------|-----------|-----------|
##      unknown |      7994 |       435 |      8429 | 
##              |     0.948 |     0.052 |     0.207 | 
## -------------|-----------|-----------|-----------|
##          yes |         3 |         0 |         3 | 
##              |     1.000 |     0.000 |     0.000 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36179 |      4590 |     40769 | 
## -------------|-----------|-----------|-----------|
## 
##

bank = bank %>%
    select(-default)

Housing

One way to assess the strength and significance of the association between the response and a categorical predictor is by using a chi-squared test.
The chi-squared test helps us to determine whether there is a statistically significant difference in the distribution of the response variable across different categories of the predictor variable. In other words, it tests whether the proportions of the response variable in different levels of the categorical predictor variable are significantly different from each other
The p-value associated to the Chi-squared test equals to 0.065, which is higher than a 0.05-threshold. So, for a confidence level of 95%, there’s no association between the dependent variable y and our feature housing. We’re removing it from the dataset.

# Stacked bar chart for categorical variable and binary response variable
bank %>%
    ggplot(aes(x = y, fill = housing)) + geom_bar() + labs(x = "Response Variable",
    y = "Count") + scale_fill_discrete(name = "Housing Loan")

crosstable_f(bank, "housing", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40769 
## 
##  
##              | y 
##      housing |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |     16416 |      2003 |     18419 | 
##              |     0.891 |     0.109 |     0.452 | 
## -------------|-----------|-----------|-----------|
##      unknown |       877 |       107 |       984 | 
##              |     0.891 |     0.109 |     0.024 | 
## -------------|-----------|-----------|-----------|
##          yes |     18886 |      2480 |     21366 | 
##              |     0.884 |     0.116 |     0.524 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36179 |      4590 |     40769 | 
## -------------|-----------|-----------|-----------|
## 
##

chisq.test(bank$housing, bank$y)

## 
##  Pearson's Chi-squared test
## 
## data:  bank$housing and bank$y
## X-squared = 5.4627, df = 2, p-value = 0.06513

bank = bank %>%
    select(-housing)

Loan

The p-value associated to the Chi-squared test equals to 0.648, which is higher than a 0.01-threshold. So, for a confidence level of 99%, there’s no association between the dependent variable y and our feature loan. We’re also removing it from the dataset

crosstable_f(bank, "loan", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40769 
## 
##  
##              | y 
##         loan |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |     29799 |      3806 |     33605 | 
##              |     0.887 |     0.113 |     0.824 | 
## -------------|-----------|-----------|-----------|
##      unknown |       877 |       107 |       984 | 
##              |     0.891 |     0.109 |     0.024 | 
## -------------|-----------|-----------|-----------|
##          yes |      5503 |       677 |      6180 | 
##              |     0.890 |     0.110 |     0.152 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36179 |      4590 |     40769 | 
## -------------|-----------|-----------|-----------|
## 
##

chisq.test(bank$loan, bank$y)

## 
##  Pearson's Chi-squared test
## 
## data:  bank$loan and bank$y
## X-squared = 0.86841, df = 2, p-value = 0.6478

bank = bank %>%
    select(-loan)

Contact

This feature is really interesting, 14.7% of cellular responders subscribed to a term deposit while only 5.2% of telephone responders did.

crosstable_f(bank, "contact", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40769 
## 
##  
##              | y 
##      contact |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##     cellular |     22098 |      3815 |     25913 | 
##              |     0.853 |     0.147 |     0.636 | 
## -------------|-----------|-----------|-----------|
##    telephone |     14081 |       775 |     14856 | 
##              |     0.948 |     0.052 |     0.364 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36179 |      4590 |     40769 | 
## -------------|-----------|-----------|-----------|
## 
##

fun_mosaic_plot(bank, "contact", "y", "Job", "Proportion")

bank %>%
    ggplot() + aes(x = contact, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")

Month

First of all, we can notice that no contact has been made during January and February. The highest spike occurs during May, with 33.4% of total contacts, but it has the worst ratio of subscribers over persons contacted (6.5%). Every month with a very low frequency of contact (march, september, october and december) shows very good results (between 44% and 51% of subscribers). December aside, there are enough observations to conclude this isn’t pure luck, so this feature will probably be very important in models

crosstable_f(bank, "month", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40769 
## 
##  
##              | y 
##        month |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##          apr |      2082 |       536 |      2618 | 
##              |     0.795 |     0.205 |     0.064 | 
## -------------|-----------|-----------|-----------|
##          aug |      5459 |       644 |      6103 | 
##              |     0.894 |     0.106 |     0.150 | 
## -------------|-----------|-----------|-----------|
##          dec |        92 |        88 |       180 | 
##              |     0.511 |     0.489 |     0.004 | 
## -------------|-----------|-----------|-----------|
##          jul |      6471 |       642 |      7113 | 
##              |     0.910 |     0.090 |     0.174 | 
## -------------|-----------|-----------|-----------|
##          jun |      4697 |       548 |      5245 | 
##              |     0.896 |     0.104 |     0.129 | 
## -------------|-----------|-----------|-----------|
##          mar |       267 |       274 |       541 | 
##              |     0.494 |     0.506 |     0.013 | 
## -------------|-----------|-----------|-----------|
##          may |     12734 |       882 |     13616 | 
##              |     0.935 |     0.065 |     0.334 | 
## -------------|-----------|-----------|-----------|
##          nov |      3672 |       412 |      4084 | 
##              |     0.899 |     0.101 |     0.100 | 
## -------------|-----------|-----------|-----------|
##          oct |       396 |       311 |       707 | 
##              |     0.560 |     0.440 |     0.017 | 
## -------------|-----------|-----------|-----------|
##          sep |       309 |       253 |       562 | 
##              |     0.550 |     0.450 |     0.014 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36179 |      4590 |     40769 | 
## -------------|-----------|-----------|-----------|
## 
##

bank %>%
    ggplot() + aes(x = month, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")

Day of the Week

There does not seem to be any dependence of day of week on our response variable.
The proportions are nearly similar
We can drop this variable

crosstable_f(bank, "day_of_week", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  40769 
## 
##  
##              | y 
##  day_of_week |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##          fri |      6936 |       839 |      7775 | 
##              |     0.892 |     0.108 |     0.191 | 
## -------------|-----------|-----------|-----------|
##          mon |      7578 |       841 |      8419 | 
##              |     0.900 |     0.100 |     0.207 | 
## -------------|-----------|-----------|-----------|
##          thu |      7493 |      1031 |      8524 | 
##              |     0.879 |     0.121 |     0.209 | 
## -------------|-----------|-----------|-----------|
##          tue |      7056 |       945 |      8001 | 
##              |     0.882 |     0.118 |     0.196 | 
## -------------|-----------|-----------|-----------|
##          wed |      7116 |       934 |      8050 | 
##              |     0.884 |     0.116 |     0.197 | 
## -------------|-----------|-----------|-----------|
## Column Total |     36179 |      4590 |     40769 | 
## -------------|-----------|-----------|-----------|
## 
##

bank %>%
    ggplot() + aes(x = day_of_week, y = ..count../rows, fill = y) + geom_bar() +
    ylab("relative frequency")

Duration

ggplot(bank, aes(x = duration, fill = y)) + geom_density(alpha = 0.5) + ggtitle("Distribution of y and duration") +
    xlab("Duration") + ylab("Density")

Campaign

Remove > 10 -> unlikely that a customer is called more than 10 times.

# How many times was the client contacted during this campaign?


bank %>%
    ggplot() + aes(x = campaign, y = y, fill = y) + geom_boxplot() + ylab("relative frequency")

bank %>% 
  ggplot() + # begin plot construction
  aes(x = campaign) + # specify x axis = campaign
  geom_bar() + # plot a bar plot faceted on y
  facet_grid(y ~ .,
             scales = "free_y") + # scales vary across y, y-axis scales of each panel will be independent of each other
  scale_x_continuous(breaks = seq(0, 100, 5))

bank = bank %>%
    filter(campaign <= 10)

bank %>% 
  ggplot() + # begin plot construction
  aes(x = campaign) + # specify x axis = campaign
  geom_bar() + # plot a bar plot faceted on y
  facet_grid(y ~ .,
             scales = "free_y") + # scales vary across y, y-axis scales of each panel will be independent of each other
  scale_x_continuous(breaks = seq(0, 100, 5))

- We can see that more number of times a customer is contacted in a campaign, the less likely it is they will subscribe for a term deposit

bank %>%
    ggplot() + aes(x = campaign, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")

fun_mosaic_plot(bank, "campaign", "y", "Campaign", "Proportion")

Pdays

This is the number of days that passed by after the client was last contacted from a previous campaign. 999 value means the client wasn’t previously contacted. Let’s make a dummy out of it.
Clients who haven’t been contacted in a previous campaign will be labeled “0” in the pdays_dummy variable.
Interesting to note that, Recontacting a client after a previous campaign seems to highly increase the odds of subscription

# number of days that passed by after the client was last contacted from a
# previous campaign

bank_counts <- table(bank$pdays)
c <- data.frame(bank_counts)
c

bank = bank %>%
    mutate(pdays_dummy = if_else(pdays == 999, "0", "1")) %>%
    select(-pdays)


fun_mosaic_plot(bank, "pdays_dummy", "y", "Previous Campaign Contact", "Proportion")

This is similar to the above plot, where we see that if the person was contacted previously –> they ended up subscribing for a term deposit in the current campaign.

# previous: number of contacts performed before this campaign and for this
# client


fun_mosaic_plot(bank, "previous", "y", "Number of Contacts in Previous Campaign",
    "Proportion")

bank %>%
    ggplot() + aes(x = previous, y = ..count../rows, fill = y) + geom_bar() + ylab("relative frequency")

Poutcome

crosstable_f(bank, "poutcome", "y")

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  39912 
## 
##  
##              | y 
##     poutcome |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##      failure |      3616 |       595 |      4211 | 
##              |     0.859 |     0.141 |     0.106 | 
## -------------|-----------|-----------|-----------|
##  nonexistent |     31268 |      3085 |     34353 | 
##              |     0.910 |     0.090 |     0.861 | 
## -------------|-----------|-----------|-----------|
##      success |       464 |       884 |      1348 | 
##              |     0.344 |     0.656 |     0.034 | 
## -------------|-----------|-----------|-----------|
## Column Total |     35348 |      4564 |     39912 | 
## -------------|-----------|-----------|-----------|
## 
##

Bivariate - Financial Covariates

3 month rate is the interest rate at which euro interbank term deposits are offered by one prime bank to another prime bank within the eurozone. It is an important benchmark used in financial markets and is used as a reference rate for various financial instruments. In the context of a person subscribing for a term deposit, a higher euribor 3 month rate generally indicates that interest rates are higher, and the returns on the term deposit will also be higher. This may make the term deposit more attractive to potential investors, including individuals who may be considering subscribing for a term deposit. Conversely, a lower euribor 3 month rate may make term deposits less attractive, as the returns on the investment would be lower.
is an economic indicator that measures changes in the number of employed people in a given population over a certain period, usually a quarter. The rate is calculated as the percentage change in the number of employed people from one period to another. The employment variation rate can have an impact on a person’s decision to subscribe for a term deposit in several ways. For example, if the employment variation rate is high, it may indicate a strong and growing economy, which can lead to higher confidence among consumers and increased spending. This increased spending may include investments in term deposits as a way to save money and earn interest.

library(reshape2)



# Select the variables to include in the correlation matrix
vars <- bank %>%
    select(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed)

# Calculate the correlation matrix
cor_matrix <- cor(vars)

# Convert the correlation matrix to a data frame
cor_df <- cor_matrix %>%
    as.data.frame() %>%
    rownames_to_column(var = "var1") %>%
    pivot_longer(cols = -var1, names_to = "var2", values_to = "correlation")

# Plot the correlation heatmap
ggplot(cor_df, aes(var1, var2)) + geom_tile(aes(fill = correlation)) + scale_fill_gradient2(low = "blue",
    mid = "white", high = "red", midpoint = 0) + geom_text(aes(label = round(correlation,
    2)), color = "black", size = 3) + theme_minimal() + theme(axis.text.x = element_text(angle = 90)) +
    coord_fixed() + labs(title = "Correlation Heatmap", x = "", y = "")

pivot_longer function is used to reshape the data frame from a wide format to a long format. It takes the columns emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed and stacks them into a single column named variable, while creating a new column named value to store the actual values.
facet_wrap function is used to create a faceted plot. It takes the variable column as the facetting variable and sets the scales argument to “free”, so that the scales of each panel can be adjusted independently.
geom_histogram is used to add a histogram layer to the plot.
scale_fill_manual is used to set the fill colors for the two levels of the y variable.
theme is used to customize the appearance of the plot. In this case, it is used to tilt the x axis labels by 90 degrees.

bank %>% 
  pivot_longer(cols = c(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed),
               names_to = "variable") %>% 
  ggplot(aes(x = value, fill = y)) +
  facet_wrap(~variable, scales = "free") +
  geom_histogram(alpha = 0.5, position = "identity", bins = 20) +
  xlab("Variable Value") +
  ylab("Count") +
  ggtitle("Histogram of Financial Variables against Response") +
  scale_fill_manual(values = c("#999999", "#E69F00")) + # custom colors for the fill
  theme(axis.text.x = element_text(angle = 90))

library(tidyverse)

# Scatterplot matrix
plot(bank %>%
    select(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed))

# Boxplot of employment variation rate by response
ggplot(bank, aes(x = y, y = emp.var.rate)) + geom_boxplot()

# Density plot of consumer price index by response
ggplot(bank, aes(x = cons.price.idx, fill = y)) + geom_density(alpha = 0.5)

# Histogram of consumer confidence index by response
ggplot(bank, aes(x = cons.conf.idx, fill = y)) + geom_histogram(alpha = 0.5, bins = 30)

# Density plot of euribor 3 month rate by response
ggplot(bank, aes(x = euribor3m, fill = y)) + geom_density(alpha = 0.5)

# Boxplot of number of employees by response
ggplot(bank, aes(x = y, y = nr.employed)) + geom_boxplot()

library(ggplot2)
library(dplyr)

# Scatter plots for numeric variables
bank %>%
    ggplot(aes(x = emp.var.rate, y = nr.employed)) + geom_point() + labs(x = "Employment Variation Rate",
    y = "Number of Employees")

bank %>%
    ggplot(aes(x = cons.price.idx, y = euribor3m)) + geom_point() + labs(x = "Consumer Price Index",
    y = "Euribor 3 Month Rate")

bank %>%
    ggplot(aes(x = cons.conf.idx, y = nr.employed)) + geom_point() + labs(x = "Consumer Confidence Index",
    y = "Number of Employees")

# Boxplots for numeric variables and binary response variable
bank %>%
    ggplot(aes(x = y, y = emp.var.rate)) + geom_boxplot() + labs(x = "Response Variable",
    y = "Employment Variation Rate")

bank %>%
    ggplot(aes(x = y, y = cons.price.idx)) + geom_boxplot() + labs(x = "Response Variable",
    y = "Consumer Price Index")

bank %>%
    ggplot(aes(x = y, y = cons.conf.idx)) + geom_boxplot() + labs(x = "Response Variable",
    y = "Consumer Confidence Index")

bank %>%
    ggplot(aes(x = y, y = euribor3m)) + geom_boxplot() + labs(x = "Response Variable",
    y = "Euribor 3 Month Rate")

bank %>%
    ggplot(aes(x = y, y = nr.employed)) + geom_boxplot() + labs(x = "Response Variable",
    y = "Number of Employees")

Cramer’s V

Dependence between y and categorical predictors

library(rcompanion)

cat_vars = c(2, 3, 4, 5, 6, 7, 8, 9, 10, 15)

y_cV <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Variable", "Pvalue",
    "CramerV", "DF"))
for (var in cat_vars) {
    ct <- xtabs(~bank[, var] + y, data = bank)
    cV <- cramerV(ct)
    y_cV[nrow(y_cV) + 1, 1] <- names(bank)[var]
    y_cV[nrow(y_cV), 2] <- round(chisq.test(ct)$p.value, 4)
    y_cV[nrow(y_cV), 3] <- cV
    y_cV[nrow(y_cV), 4] <- min(length(unique(bank[, var])) - 1, length(unique(bank)) -
        1)
}

y_cV <- y_cV[order(y_cV$CramerV, decreasing = TRUE), ]
y_cV

# y is independent of housing and loan

# Create a bar plot

ggplot(y_cV[c("Variable", "CramerV")], aes(x = Variable, y = CramerV, fill = CramerV)) +
    geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + geom_col()
    geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + +
    geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + #scale_fill_gradient(low
    geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + = 'white',
    geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + high =
    geom_col() + #scale_fill_gradient(low = 'white', high = 'red') + 'red') +
ggtitle("Cramer V for different variables") + xlab("Variables") + ylab("Cramer V vs Response") +
    theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Dependence between pairs of categorical predictors

cat_vars2 = c(2, 3, 4, 5, 8, 9, 10, 15)
cat_cV <- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("Variable1", "Variable2",
    "CramerV", "DF"))
for (var1 in cat_vars2) {
    for (var2 in cat_vars2) {
        if (var1 == var2) {
            break
        } else {
            ct <- xtabs(~bank[, var1] + bank[, var2], data = bank)
            cV <- cramerV(ct)
            cat_cV[nrow(cat_cV) + 1, 1] <- names(bank)[var1]
            cat_cV[nrow(cat_cV), 2] <- names(bank)[var2]
            cat_cV[nrow(cat_cV), 3] <- cV
            cat_cV[nrow(cat_cV), 4] <- min(length(unique(bank[, var1])) - 1, length(unique(bank[,
                var2])) - 1)
        }
    }
}

cat_cV <- cat_cV[order(cat_cV$CramerV, decreasing = TRUE), ]
cat_cV

Model Building

Stratified Split

library(caret)

# Split the dataset into training (60%), testing (20%), and validation (20%)
# sets
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(bank$y, p = 0.6, list = FALSE, times = 1)
train_data <- bank[trainIndex, ]

test <- bank[-trainIndex, ]

testIndex <- createDataPartition(test$y, p = 0.5, list = FALSE, times = 1)
test_data <- test[-testIndex, ]
val_data <- test[testIndex, ]

# Check the class distribution in each dataset
prop.table(table(bank$y))

## 
##         0         1 
## 0.8856484 0.1143516

prop.table(table(train_data$y))

## 
##         0         1 
## 0.8856272 0.1143728

prop.table(table(test_data$y))

## 
##         0         1 
## 0.8857286 0.1142714

prop.table(table(val_data$y))

## 
##        0        1 
## 0.885632 0.114368

nrow(train_data)

## [1] 23948

nrow(test_data)

## [1] 7981

nrow(val_data)

## [1] 7983

Final Project

Angad

2023-04-14

Load Data

Helper Functions

EDA

Univariate

Age

Job

Marital Status

Education

Default

Housing

Loan

Contact

Month

Day of the Week

Duration

Campaign

Pdays

Previous

Poutcome

Bivariate - Financial Covariates

Cramer’s V

Dependence between y and categorical predictors

Dependence between pairs of categorical predictors

Model Building

Stratified Split