Load packages

library(tidyverse)
library(GGally)
library(ggpubr)

Load data

# Read the semicolon-delimited file; whitespace around fields is trimmed.
bank <- read_delim(
  file = "bank-full.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)

Exploratory Data Analysis

Review the structure and content of the data

# Column-wise overview: one line per column with its type and first values
bank |> glimpse()
## Rows: 45,211
## Columns: 17
## $ age       <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance   <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day       <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration  <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays     <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

and answer questions such as: Are the features (columns) of your data correlated? What are the relationships between different variables?

# Pairs-plot matrix restricted to the numeric columns
bank |>
  select(where(is.numeric)) |>
  ggpairs()

What is the overall distribution of each variable? How are categorical variables distributed?

# Separate numeric and character columns so each group gets its own plot grid
bank_vars1 <- select(bank, where(is.numeric))
bank_vars2 <- select(bank, where(is.character))


# One histogram per numeric column
distributions1 <- map(
  names(bank_vars1),
  \(var_name) ggplot(bank_vars1, aes(x = .data[[var_name]])) + geom_histogram()
)

# One bar chart per character column, bars ordered from most to least frequent
distributions2 <- map(
  names(bank_vars2),
  \(var_name) {
    ggplot(bank_vars2, aes(x = fct_infreq(.data[[var_name]]))) +
      geom_bar() +
      xlab(var_name) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  }
)


# Arrange each list of plots into a single grid
ggarrange(plotlist = distributions1)

ggarrange(plotlist = distributions2)

Are there any outliers present?

# Find the outliers of a numeric vector using the 1.5 * IQR rule.
#
# A value is an outlier when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile. Missing values are ignored both when
# computing the quartiles and when collecting the outliers.
#
# x: a numeric vector.
# Returns a list with a single element, `outliers`, holding the values of `x`
# that fall outside the fences (never NA).
outlier_counts <- function(x) {
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- q[2] - q[1]
  lower <- q[1] - 1.5 * iqr
  upper <- q[2] + 1.5 * iqr
  # which() drops NA comparisons; plain logical indexing would return one NA
  # per missing value in `x` and inflate the outlier count.
  list(outliers = x[which(x < lower | x > upper)])
}

# Names of the numeric columns, in data-set order
num_pred_names <- bank |>
  select(where(is.numeric)) |>
  names()

# One row per numeric variable: its outlier values (list-column) and how many
# there are, sorted from most to fewest outliers.
# lapply() always returns a list, so the list-column no longer relies on
# sapply() happening to flatten the nested list(outliers = ...) results.
tibble(
  variable = num_pred_names,
  outliers = lapply(bank[num_pred_names], \(x) outlier_counts(x)$outliers),
  n_outliers = lengths(outliers)
) |>
  arrange(desc(n_outliers))
## # A tibble: 7 × 3
##   variable outliers      n_outliers
##   <chr>    <named list>       <int>
## 1 pdays    <dbl [8,257]>       8257
## 2 previous <dbl [8,257]>       8257
## 3 balance  <dbl [4,729]>       4729
## 4 duration <dbl [3,235]>       3235
## 5 campaign <dbl [3,064]>       3064
## 6 age      <dbl [487]>          487
## 7 day      <dbl [0]>              0
# Box plots of the numeric variables, listed top to bottom from most to
# fewest outliers (the order found in the table above)
outlier_order <- c("pdays", "previous", "balance", "duration",
                   "campaign", "age", "day")

bank |>
  select(all_of(outlier_order)) |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) |>
  # coord_flip() draws the first factor level at the bottom, hence rev()
  mutate(variable = factor(variable, levels = rev(outlier_order))) |>
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Visualizing Outliers", x = "", y = "")

Are there any missing values and how significant are they?

# Total number of missing values across the whole data set
na_overall <- sum(is.na(bank))
# Missing values per column; vapply() guarantees a named integer vector,
# whereas sapply() would return a list for a zero-column input
na_by_col <- vapply(bank, \(x) sum(is.na(x)), integer(1))
na_overall
na_by_col
## [1] 0
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0
# Share of "unknown" entries in each character column, largest share first
prop_unknown_in <- \(col) mean(col == "unknown", na.rm = TRUE)

unknown_rates <- bank |>
  summarize(across(where(is.character), prop_unknown_in)) |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "prop_unknown"
  ) |>
  arrange(desc(prop_unknown))

unknown_rates
## # A tibble: 10 × 2
##    variable  prop_unknown
##    <chr>            <dbl>
##  1 poutcome       0.817  
##  2 contact        0.288  
##  3 education      0.0411 
##  4 job            0.00637
##  5 marital        0      
##  6 default        0      
##  7 housing        0      
##  8 loan           0      
##  9 month          0      
## 10 y              0