1 Load libraries

library(tidyverse)
library(skimr)
library(GGally)
library(janitor)
library(pROC)

options(dplyr.summarise.inform = FALSE)
theme_set(theme_minimal())

2 Importing Data from file

Using the Bank Marketing dataset (bank-additional-full.csv) from UCI ML Repository.

library(tidyverse)

# Loading downloaded dataset 
file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"

# Reading the dataset 
bank <- read_csv2(file_path)
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 41188 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (12): job, marital, education, default, housing, loan, contact, month, d...
## dbl  (5): age, duration, campaign, pdays, previous
## num  (4): emp.var.rate, cons.price.idx, cons.conf.idx, nr.employed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Taking a look at the data
glimpse(bank)
## Rows: 41,188
## Columns: 21
## $ age            <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital        <chr> "married", "married", "married", "married", "married", …
## $ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration       <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate   <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,…
## $ cons.price.idx <dbl> 93994, 93994, 93994, 93994, 93994, 93994, 93994, 93994,…
## $ cons.conf.idx  <dbl> -364, -364, -364, -364, -364, -364, -364, -364, -364, -…
## $ euribor3m      <chr> "4.857", "4.857", "4.857", "4.857", "4.857", "4.857", "…
## $ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
summary(bank)
##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome          emp.var.rate     
##  Min.   :  0.0   Min.   :0.000   Length:41188       Min.   :-34.0000  
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   1st Qu.:-18.0000  
##  Median :999.0   Median :0.000   Mode  :character   Median : 11.0000  
##  Mean   :962.5   Mean   :0.173                      Mean   :  0.9316  
##  3rd Qu.:999.0   3rd Qu.:0.000                      3rd Qu.: 14.0000  
##  Max.   :999.0   Max.   :7.000                      Max.   : 14.0000  
##  cons.price.idx  cons.conf.idx     euribor3m          nr.employed   
##  Min.   :  932   Min.   :-508.0   Length:41188       Min.   : 5191  
##  1st Qu.:92893   1st Qu.:-427.0   Class :character   1st Qu.:50175  
##  Median :93749   Median :-403.0   Mode  :character   Median :50991  
##  Mean   :85475   Mean   :-365.7                      Mean   :42865  
##  3rd Qu.:93994   3rd Qu.:-361.0                      3rd Qu.:52281  
##  Max.   :94767   Max.   : -33.0                      Max.   :52281  
##       y            
##  Length:41188      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

2.1 Check Leakage and Target

We care about the outcome in the column labeled y, it tells us if the customer said yes or no to opening a term deposit. The column labeled duration, shows how long phone calls lasted. Call length are only known after calls are completed, therefore its not used in predicting coustomers answer ahead of time. this won’t be used for building models, but will be look at duringanalysis to see what patterns it shows.

table(bank$y)
## 
##    no   yes 
## 36548  4640
summary(bank$duration)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   102.0   180.0   258.3   319.0  4918.0

3 Quality of Data Overview

bank_na <- bank %>% mutate(across(where(is.character), ~na_if(.x, "unknown")))

skimr::skim(bank_na)
Data summary
Name bank_na
Number of rows 41188
Number of columns 21
_______________________
Column type frequency:
character 12
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
job 330 0.99 6 13 0 11 0
marital 80 1.00 6 8 0 3 0
education 1731 0.96 8 19 0 7 0
default 8597 0.79 2 3 0 2 0
housing 990 0.98 2 3 0 2 0
loan 990 0.98 2 3 0 2 0
contact 0 1.00 8 9 0 2 0
month 0 1.00 3 3 0 10 0
day_of_week 0 1.00 3 3 0 5 0
poutcome 0 1.00 7 11 0 3 0
euribor3m 0 1.00 1 5 0 316 0
y 0 1.00 2 3 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 40.02 10.42 17 32 38 47 98 ▅▇▃▁▁
duration 0 1 258.29 259.28 0 102 180 319 4918 ▇▁▁▁▁
campaign 0 1 2.57 2.77 1 1 2 3 56 ▇▁▁▁▁
pdays 0 1 962.48 186.91 0 999 999 999 999 ▁▁▁▁▇
previous 0 1 0.17 0.49 0 0 0 0 7 ▇▁▁▁▁
emp.var.rate 0 1 0.93 15.58 -34 -18 11 14 14 ▁▃▁▁▇
cons.price.idx 0 1 85475.22 26234.18 932 92893 93749 93994 94767 ▁▁▁▁▇
cons.conf.idx 0 1 -365.67 119.10 -508 -427 -403 -361 -33 ▇▆▁▁▂
nr.employed 0 1 42864.89 18170.20 5191 50175 50991 52281 52281 ▂▁▁▁▇
n_counts <- bank_na %>% summarise(across(everything(), ~sum(is.na(.x))))
t(n_counts)
##                [,1]
## age               0
## job             330
## marital          80
## education      1731
## default        8597
## housing         990
## loan            990
## contact           0
## month             0
## day_of_week       0
## duration          0
## campaign          0
## pdays             0
## previous          0
## poutcome          0
## emp.var.rate      0
## cons.price.idx    0
## cons.conf.idx     0
## euribor3m         0
## nr.employed       0
## y                 0

3.0.1 Duplicates & consistency

dup_rows <- bank %>% add_count(across(everything()), name = "dupe_n") %>% filter(dupe_n > 1)
n_distinct_rows <- nrow(distinct(bank))
list(total_rows = nrow(bank), distinct_rows = n_distinct_rows, duplicate_rows = nrow(bank) - n_distinct_rows) 
## $total_rows
## [1] 41188
## 
## $distinct_rows
## [1] 41176
## 
## $duplicate_rows
## [1] 12

3.0.2 check for Consistency

issues <- list(
  negative_or_zero_duration = bank %>% filter(!is.na(duration) & duration <= 0) %>% nrow(),
  extreme_campaign_counts    = bank %>% filter(!is.na(campaign) & campaign < 1) %>% nrow(),
  pdays_special_999          = bank %>% filter(pdays == 999) %>% nrow(),
  negative_balances          = if ("emp.var.rate" %in% names(bank)) NA_integer_ else NA_integer_
)
issues
## $negative_or_zero_duration
## [1] 4
## 
## $extreme_campaign_counts
## [1] 0
## 
## $pdays_special_999
## [1] 39673
## 
## $negative_balances
## [1] NA

4 Distributions

numeric_vars <- bank %>% select(where(is.double) | where(is.integer))
numeric_vars %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_histogram(bins = 30) +
  labs(title = "Numeric feature distributions")

numeric_vars %>%
  summarise(across(
    everything(),
    list(
      mean   = ~mean(.x, na.rm = TRUE),
      median = ~median(.x, na.rm = TRUE),
      sd     = ~sd(.x, na.rm = TRUE),
      q1     = ~quantile(.x, 0.25, na.rm = TRUE),
      q3     = ~quantile(.x, 0.75, na.rm = TRUE)
    )
  )) %>%
  pivot_longer(everything())
## # A tibble: 45 × 2
##    name            value
##    <chr>           <dbl>
##  1 age_mean         40.0
##  2 age_median       38  
##  3 age_sd           10.4
##  4 age_q1           32  
##  5 age_q3           47  
##  6 duration_mean   258. 
##  7 duration_median 180  
##  8 duration_sd     259. 
##  9 duration_q1     102  
## 10 duration_q3     319  
## # ℹ 35 more rows
numeric_vars %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(outlier.alpha = 0.4) +
  coord_flip() +
  labs(title = "Boxplots to visualize outliers")

5 Categorical Variables

cat_vars <- bank %>% select(where(is.character))
cat_vars %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = fct_infreq(value))) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_bar() + coord_flip() +
  labs(title = "Categorical feature distributions", x = "Level", y = "Count")

5.0.1 Numeric correlations

corr_df <- bank %>% select(where(is.numeric), -duration)
if (!is.null(corr_df) && ncol(corr_df) > 1) {
  GGally::ggcorr(corr_df, label = TRUE, label_round = 2, hjust = 0.9, size = 3) +
    labs(title = "Correlation matrix (numeric features only, excluding duration)")
}

5.0.2 Target relationships

bank %>%
  select(where(is.numeric), y) %>%
  pivot_longer(-y, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = y, y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_boxplot() +
  labs(title = "Numeric features vs. target (y)")

cat_vs_target <- cat_vars %>%
  mutate(y = bank$y) %>%
  pivot_longer(-y, names_to = "variable", values_to = "value") %>%
  count(variable, value, y) %>% group_by(variable) %>%
  mutate(prop = n / sum(n))

ggplot(cat_vs_target, aes(x = value, y = prop, fill = y)) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_col(position = "fill") +
  coord_flip() +
  labs(title = "Categorical features vs. target (proportional)", y = "Proportion")

7 Check for Imbalance

target_rate <- mean(bank$y == "yes")
list(positive_rate = target_rate, counts = table(bank$y))
## $positive_rate
## [1] 0.1126542
## 
## $counts
## 
##    no   yes 
## 36548  4640

8 Example Train/Test

This is an example how the data is prepared for modeling. the duration column is removed to avoid leakage, the target variable y is set as yes/no, and all the other features are converted into numbers using.

set.seed(42)

bank_model <- bank %>% select(-duration) 
bank_model$y <- factor(bank_model$y, levels = c("no", "yes"))

x <- model.matrix(y ~ . , data = bank_model)[,-1]  
y <- bank_model$y

dim(x)
## [1] 41188   366
table(y)
## y
##    no   yes 
## 36548  4640