1 Load libraries
2 Importing Data from file
- 2.1 Check Leakage and Target
3 Quality of Data Overview
- 3.0.1 Duplicates & consistency
- 3.0.2 check for Consistency
4 Distributions
5 Categorical Variables
- 5.0.1 Numeric correlations
- 5.0.2 Target relationships
6 Trends
7 Check for Imbalance
8 Example Train/Test

1 Load libraries

library(tidyverse)
library(skimr)
library(GGally)
library(janitor)
library(pROC)

options(dplyr.summarise.inform = FALSE)
theme_set(theme_minimal())

2 Importing Data from file

Using the Bank Marketing dataset (bank-additional-full.csv) from UCI ML Repository.

library(tidyverse)

# Loading downloaded dataset 
file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"

# Reading the dataset 
bank <- read_csv2(file_path)

## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

## Rows: 41188 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (12): job, marital, education, default, housing, loan, contact, month, d...
## dbl  (5): age, duration, campaign, pdays, previous
## num  (4): emp.var.rate, cons.price.idx, cons.conf.idx, nr.employed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Taking a look at the data
glimpse(bank)

## Rows: 41,188
## Columns: 21
## $ age            <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital        <chr> "married", "married", "married", "married", "married", …
## $ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration       <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate   <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,…
## $ cons.price.idx <dbl> 93994, 93994, 93994, 93994, 93994, 93994, 93994, 93994,…
## $ cons.conf.idx  <dbl> -364, -364, -364, -364, -364, -364, -364, -364, -364, -…
## $ euribor3m      <chr> "4.857", "4.857", "4.857", "4.857", "4.857", "4.857", "…
## $ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…

summary(bank)

##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome          emp.var.rate     
##  Min.   :  0.0   Min.   :0.000   Length:41188       Min.   :-34.0000  
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   1st Qu.:-18.0000  
##  Median :999.0   Median :0.000   Mode  :character   Median : 11.0000  
##  Mean   :962.5   Mean   :0.173                      Mean   :  0.9316  
##  3rd Qu.:999.0   3rd Qu.:0.000                      3rd Qu.: 14.0000  
##  Max.   :999.0   Max.   :7.000                      Max.   : 14.0000  
##  cons.price.idx  cons.conf.idx     euribor3m          nr.employed   
##  Min.   :  932   Min.   :-508.0   Length:41188       Min.   : 5191  
##  1st Qu.:92893   1st Qu.:-427.0   Class :character   1st Qu.:50175  
##  Median :93749   Median :-403.0   Mode  :character   Median :50991  
##  Mean   :85475   Mean   :-365.7                      Mean   :42865  
##  3rd Qu.:93994   3rd Qu.:-361.0                      3rd Qu.:52281  
##  Max.   :94767   Max.   : -33.0                      Max.   :52281  
##       y            
##  Length:41188      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

2.1 Check Leakage and Target

We care about the outcome in the column labeled y, it tells us if the customer said yes or no to opening a term deposit. The column labeled duration, shows how long phone calls lasted. Call length are only known after calls are completed, therefore its not used in predicting coustomers answer ahead of time. this won’t be used for building models, but will be look at duringanalysis to see what patterns it shows.

table(bank$y)

## 
##    no   yes 
## 36548  4640

summary(bank$duration)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   102.0   180.0   258.3   319.0  4918.0

3 Quality of Data Overview

bank_na <- bank %>% mutate(across(where(is.character), ~na_if(.x, "unknown")))

skimr::skim(bank_na)

Data summary
Name	bank_na
Number of rows	41188
Number of columns	21
_______________________
Column type frequency:
character	12
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
job	330	0.99	6	13	11
marital	80	1.00	6	8	3
education	1731	0.96	8	19	7
default	8597	0.79	2	3	2
housing	990	0.98	2	3	2
loan	990	0.98	2	3	2
contact	0	1.00	8	9	2
month	0	1.00	3	3	10
day_of_week	0	1.00	3	3	5
poutcome	0	1.00	7	11	3
euribor3m	0	1.00	1	5	316
y	0	1.00	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	1	40.02	10.42	17	32	38	47	98	▅▇▃▁▁
duration	1	258.29	259.28	0	102	180	319	4918	▇▁▁▁▁
campaign	1	2.57	2.77	1	1	2	3	56	▇▁▁▁▁
pdays	1	962.48	186.91	0	999	999	999	999	▁▁▁▁▇
previous	1	0.17	0.49	0	0	0	0	7	▇▁▁▁▁
emp.var.rate	1	0.93	15.58	-34	-18	11	14	14	▁▃▁▁▇
cons.price.idx	1	85475.22	26234.18	932	92893	93749	93994	94767	▁▁▁▁▇
cons.conf.idx	1	-365.67	119.10	-508	-427	-403	-361	-33	▇▆▁▁▂
nr.employed	1	42864.89	18170.20	5191	50175	50991	52281	52281	▂▁▁▁▇

n_counts <- bank_na %>% summarise(across(everything(), ~sum(is.na(.x))))
t(n_counts)

##                [,1]
## age               0
## job             330
## marital          80
## education      1731
## default        8597
## housing         990
## loan            990
## contact           0
## month             0
## day_of_week       0
## duration          0
## campaign          0
## pdays             0
## previous          0
## poutcome          0
## emp.var.rate      0
## cons.price.idx    0
## cons.conf.idx     0
## euribor3m         0
## nr.employed       0
## y                 0

3.0.1 Duplicates & consistency

dup_rows <- bank %>% add_count(across(everything()), name = "dupe_n") %>% filter(dupe_n > 1)
n_distinct_rows <- nrow(distinct(bank))
list(total_rows = nrow(bank), distinct_rows = n_distinct_rows, duplicate_rows = nrow(bank) - n_distinct_rows)

## $total_rows
## [1] 41188
## 
## $distinct_rows
## [1] 41176
## 
## $duplicate_rows
## [1] 12

3.0.2 check for Consistency

issues <- list(
  negative_or_zero_duration = bank %>% filter(!is.na(duration) & duration <= 0) %>% nrow(),
  extreme_campaign_counts    = bank %>% filter(!is.na(campaign) & campaign < 1) %>% nrow(),
  pdays_special_999          = bank %>% filter(pdays == 999) %>% nrow(),
  negative_balances          = if ("emp.var.rate" %in% names(bank)) NA_integer_ else NA_integer_
)
issues

## $negative_or_zero_duration
## [1] 4
## 
## $extreme_campaign_counts
## [1] 0
## 
## $pdays_special_999
## [1] 39673
## 
## $negative_balances
## [1] NA

4 Distributions

numeric_vars <- bank %>% select(where(is.double) | where(is.integer))
numeric_vars %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_histogram(bins = 30) +
  labs(title = "Numeric feature distributions")

numeric_vars %>%
  summarise(across(
    everything(),
    list(
      mean   = ~mean(.x, na.rm = TRUE),
      median = ~median(.x, na.rm = TRUE),
      sd     = ~sd(.x, na.rm = TRUE),
      q1     = ~quantile(.x, 0.25, na.rm = TRUE),
      q3     = ~quantile(.x, 0.75, na.rm = TRUE)
    )
  )) %>%
  pivot_longer(everything())

## # A tibble: 45 × 2
##    name            value
##    <chr>           <dbl>
##  1 age_mean         40.0
##  2 age_median       38  
##  3 age_sd           10.4
##  4 age_q1           32  
##  5 age_q3           47  
##  6 duration_mean   258. 
##  7 duration_median 180  
##  8 duration_sd     259. 
##  9 duration_q1     102  
## 10 duration_q3     319  
## # ℹ 35 more rows

numeric_vars %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(outlier.alpha = 0.4) +
  coord_flip() +
  labs(title = "Boxplots to visualize outliers")

5 Categorical Variables

cat_vars <- bank %>% select(where(is.character))
cat_vars %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = fct_infreq(value))) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_bar() + coord_flip() +
  labs(title = "Categorical feature distributions", x = "Level", y = "Count")

5.0.1 Numeric correlations

corr_df <- bank %>% select(where(is.numeric), -duration)
if (!is.null(corr_df) && ncol(corr_df) > 1) {
  GGally::ggcorr(corr_df, label = TRUE, label_round = 2, hjust = 0.9, size = 3) +
    labs(title = "Correlation matrix (numeric features only, excluding duration)")
}

5.0.2 Target relationships

bank %>%
  select(where(is.numeric), y) %>%
  pivot_longer(-y, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = y, y = value)) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_boxplot() +
  labs(title = "Numeric features vs. target (y)")

cat_vs_target <- cat_vars %>%
  mutate(y = bank$y) %>%
  pivot_longer(-y, names_to = "variable", values_to = "value") %>%
  count(variable, value, y) %>% group_by(variable) %>%
  mutate(prop = n / sum(n))

ggplot(cat_vs_target, aes(x = value, y = prop, fill = y)) +
  facet_wrap(~ variable, scales = "free", ncol = 3) +
  geom_col(position = "fill") +
  coord_flip() +
  labs(title = "Categorical features vs. target (proportional)", y = "Proportion")

6 Trends

bank_fe <- bank %>%
  mutate(
    recent_contact = if_else(pdays == 999, "no_prior", "prior"),
    call_intensity = campaign + previous,
    age_bucket = cut(age, breaks = c(-Inf, 29, 59, Inf), labels = c("under30","30to59","60plus"))
  )

glimpse(bank_fe)

## Rows: 41,188
## Columns: 24
## $ age            <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital        <chr> "married", "married", "married", "married", "married", …
## $ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration       <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate   <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,…
## $ cons.price.idx <dbl> 93994, 93994, 93994, 93994, 93994, 93994, 93994, 93994,…
## $ cons.conf.idx  <dbl> -364, -364, -364, -364, -364, -364, -364, -364, -364, -…
## $ euribor3m      <chr> "4.857", "4.857", "4.857", "4.857", "4.857", "4.857", "…
## $ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
## $ recent_contact <chr> "no_prior", "no_prior", "no_prior", "no_prior", "no_pri…
## $ call_intensity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ age_bucket     <fct> 30to59, 30to59, 30to59, 30to59, 30to59, 30to59, 30to59,…

7 Check for Imbalance

target_rate <- mean(bank$y == "yes")
list(positive_rate = target_rate, counts = table(bank$y))

## $positive_rate
## [1] 0.1126542
## 
## $counts
## 
##    no   yes 
## 36548  4640

8 Example Train/Test

This is an example how the data is prepared for modeling. the duration column is removed to avoid leakage, the target variable y is set as yes/no, and all the other features are converted into numbers using.

set.seed(42)

bank_model <- bank %>% select(-duration) 
bank_model$y <- factor(bank_model$y, levels = c("no", "yes"))

x <- model.matrix(y ~ . , data = bank_model)[,-1]  
y <- bank_model$y

dim(x)

## [1] 41188   366

table(y)

## y
##    no   yes 
## 36548  4640

Exploratory Data Analysis: UCI Bank Marketing

Michael Robinson

September 28, 2025