library(tidyverse)
library(skimr)
library(GGally)
library(janitor)
library(pROC)
options(dplyr.summarise.inform = FALSE)
theme_set(theme_minimal())
Using the Bank Marketing dataset (bank-additional-full.csv) from UCI ML Repository.
library(tidyverse)
# Loading downloaded dataset
file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"
# Reading the dataset
bank <- read_csv2(file_path)
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 41188 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (12): job, marital, education, default, housing, loan, contact, month, d...
## dbl (5): age, duration, campaign, pdays, previous
## num (4): emp.var.rate, cons.price.idx, cons.conf.idx, nr.employed
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Taking a look at the data
glimpse(bank)
## Rows: 41,188
## Columns: 21
## $ age <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital <chr> "married", "married", "married", "married", "married", …
## $ education <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,…
## $ cons.price.idx <dbl> 93994, 93994, 93994, 93994, 93994, 93994, 93994, 93994,…
## $ cons.conf.idx <dbl> -364, -364, -364, -364, -364, -364, -364, -364, -364, -…
## $ euribor3m <chr> "4.857", "4.857", "4.857", "4.857", "4.857", "4.857", "…
## $ nr.employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
summary(bank)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:41188 Min. :-34.0000
## 1st Qu.:999.0 1st Qu.:0.000 Class :character 1st Qu.:-18.0000
## Median :999.0 Median :0.000 Mode :character Median : 11.0000
## Mean :962.5 Mean :0.173 Mean : 0.9316
## 3rd Qu.:999.0 3rd Qu.:0.000 3rd Qu.: 14.0000
## Max. :999.0 Max. :7.000 Max. : 14.0000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. : 932 Min. :-508.0 Length:41188 Min. : 5191
## 1st Qu.:92893 1st Qu.:-427.0 Class :character 1st Qu.:50175
## Median :93749 Median :-403.0 Mode :character Median :50991
## Mean :85475 Mean :-365.7 Mean :42865
## 3rd Qu.:93994 3rd Qu.:-361.0 3rd Qu.:52281
## Max. :94767 Max. : -33.0 Max. :52281
## y
## Length:41188
## Class :character
## Mode :character
##
##
##
We care about the outcome in the column labeled y, it tells us if the customer said yes or no to opening a term deposit. The column labeled duration, shows how long phone calls lasted. Call length are only known after calls are completed, therefore its not used in predicting coustomers answer ahead of time. this won’t be used for building models, but will be look at duringanalysis to see what patterns it shows.
table(bank$y)
##
## no yes
## 36548 4640
summary(bank$duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 102.0 180.0 258.3 319.0 4918.0
bank_na <- bank %>% mutate(across(where(is.character), ~na_if(.x, "unknown")))
skimr::skim(bank_na)
| Name | bank_na |
| Number of rows | 41188 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 12 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| job | 330 | 0.99 | 6 | 13 | 0 | 11 | 0 |
| marital | 80 | 1.00 | 6 | 8 | 0 | 3 | 0 |
| education | 1731 | 0.96 | 8 | 19 | 0 | 7 | 0 |
| default | 8597 | 0.79 | 2 | 3 | 0 | 2 | 0 |
| housing | 990 | 0.98 | 2 | 3 | 0 | 2 | 0 |
| loan | 990 | 0.98 | 2 | 3 | 0 | 2 | 0 |
| contact | 0 | 1.00 | 8 | 9 | 0 | 2 | 0 |
| month | 0 | 1.00 | 3 | 3 | 0 | 10 | 0 |
| day_of_week | 0 | 1.00 | 3 | 3 | 0 | 5 | 0 |
| poutcome | 0 | 1.00 | 7 | 11 | 0 | 3 | 0 |
| euribor3m | 0 | 1.00 | 1 | 5 | 0 | 316 | 0 |
| y | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 40.02 | 10.42 | 17 | 32 | 38 | 47 | 98 | ▅▇▃▁▁ |
| duration | 0 | 1 | 258.29 | 259.28 | 0 | 102 | 180 | 319 | 4918 | ▇▁▁▁▁ |
| campaign | 0 | 1 | 2.57 | 2.77 | 1 | 1 | 2 | 3 | 56 | ▇▁▁▁▁ |
| pdays | 0 | 1 | 962.48 | 186.91 | 0 | 999 | 999 | 999 | 999 | ▁▁▁▁▇ |
| previous | 0 | 1 | 0.17 | 0.49 | 0 | 0 | 0 | 0 | 7 | ▇▁▁▁▁ |
| emp.var.rate | 0 | 1 | 0.93 | 15.58 | -34 | -18 | 11 | 14 | 14 | ▁▃▁▁▇ |
| cons.price.idx | 0 | 1 | 85475.22 | 26234.18 | 932 | 92893 | 93749 | 93994 | 94767 | ▁▁▁▁▇ |
| cons.conf.idx | 0 | 1 | -365.67 | 119.10 | -508 | -427 | -403 | -361 | -33 | ▇▆▁▁▂ |
| nr.employed | 0 | 1 | 42864.89 | 18170.20 | 5191 | 50175 | 50991 | 52281 | 52281 | ▂▁▁▁▇ |
n_counts <- bank_na %>% summarise(across(everything(), ~sum(is.na(.x))))
t(n_counts)
## [,1]
## age 0
## job 330
## marital 80
## education 1731
## default 8597
## housing 990
## loan 990
## contact 0
## month 0
## day_of_week 0
## duration 0
## campaign 0
## pdays 0
## previous 0
## poutcome 0
## emp.var.rate 0
## cons.price.idx 0
## cons.conf.idx 0
## euribor3m 0
## nr.employed 0
## y 0
dup_rows <- bank %>% add_count(across(everything()), name = "dupe_n") %>% filter(dupe_n > 1)
n_distinct_rows <- nrow(distinct(bank))
list(total_rows = nrow(bank), distinct_rows = n_distinct_rows, duplicate_rows = nrow(bank) - n_distinct_rows)
## $total_rows
## [1] 41188
##
## $distinct_rows
## [1] 41176
##
## $duplicate_rows
## [1] 12
issues <- list(
negative_or_zero_duration = bank %>% filter(!is.na(duration) & duration <= 0) %>% nrow(),
extreme_campaign_counts = bank %>% filter(!is.na(campaign) & campaign < 1) %>% nrow(),
pdays_special_999 = bank %>% filter(pdays == 999) %>% nrow(),
negative_balances = if ("emp.var.rate" %in% names(bank)) NA_integer_ else NA_integer_
)
issues
## $negative_or_zero_duration
## [1] 4
##
## $extreme_campaign_counts
## [1] 0
##
## $pdays_special_999
## [1] 39673
##
## $negative_balances
## [1] NA
numeric_vars <- bank %>% select(where(is.double) | where(is.integer))
numeric_vars %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
facet_wrap(~ variable, scales = "free", ncol = 3) +
geom_histogram(bins = 30) +
labs(title = "Numeric feature distributions")
numeric_vars %>%
summarise(across(
everything(),
list(
mean = ~mean(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
q1 = ~quantile(.x, 0.25, na.rm = TRUE),
q3 = ~quantile(.x, 0.75, na.rm = TRUE)
)
)) %>%
pivot_longer(everything())
## # A tibble: 45 × 2
## name value
## <chr> <dbl>
## 1 age_mean 40.0
## 2 age_median 38
## 3 age_sd 10.4
## 4 age_q1 32
## 5 age_q3 47
## 6 duration_mean 258.
## 7 duration_median 180
## 8 duration_sd 259.
## 9 duration_q1 102
## 10 duration_q3 319
## # ℹ 35 more rows
numeric_vars %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = variable, y = value)) +
geom_boxplot(outlier.alpha = 0.4) +
coord_flip() +
labs(title = "Boxplots to visualize outliers")
cat_vars <- bank %>% select(where(is.character))
cat_vars %>%
pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = fct_infreq(value))) +
facet_wrap(~ variable, scales = "free", ncol = 3) +
geom_bar() + coord_flip() +
labs(title = "Categorical feature distributions", x = "Level", y = "Count")
corr_df <- bank %>% select(where(is.numeric), -duration)
if (!is.null(corr_df) && ncol(corr_df) > 1) {
GGally::ggcorr(corr_df, label = TRUE, label_round = 2, hjust = 0.9, size = 3) +
labs(title = "Correlation matrix (numeric features only, excluding duration)")
}
bank %>%
select(where(is.numeric), y) %>%
pivot_longer(-y, names_to = "variable", values_to = "value") %>%
ggplot(aes(x = y, y = value)) +
facet_wrap(~ variable, scales = "free", ncol = 3) +
geom_boxplot() +
labs(title = "Numeric features vs. target (y)")
cat_vs_target <- cat_vars %>%
mutate(y = bank$y) %>%
pivot_longer(-y, names_to = "variable", values_to = "value") %>%
count(variable, value, y) %>% group_by(variable) %>%
mutate(prop = n / sum(n))
ggplot(cat_vs_target, aes(x = value, y = prop, fill = y)) +
facet_wrap(~ variable, scales = "free", ncol = 3) +
geom_col(position = "fill") +
coord_flip() +
labs(title = "Categorical features vs. target (proportional)", y = "Proportion")
bank_fe <- bank %>%
mutate(
recent_contact = if_else(pdays == 999, "no_prior", "prior"),
call_intensity = campaign + previous,
age_bucket = cut(age, breaks = c(-Inf, 29, 59, Inf), labels = c("under30","30to59","60plus"))
)
glimpse(bank_fe)
## Rows: 41,188
## Columns: 24
## $ age <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital <chr> "married", "married", "married", "married", "married", …
## $ education <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,…
## $ cons.price.idx <dbl> 93994, 93994, 93994, 93994, 93994, 93994, 93994, 93994,…
## $ cons.conf.idx <dbl> -364, -364, -364, -364, -364, -364, -364, -364, -364, -…
## $ euribor3m <chr> "4.857", "4.857", "4.857", "4.857", "4.857", "4.857", "…
## $ nr.employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
## $ recent_contact <chr> "no_prior", "no_prior", "no_prior", "no_prior", "no_pri…
## $ call_intensity <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ age_bucket <fct> 30to59, 30to59, 30to59, 30to59, 30to59, 30to59, 30to59,…
target_rate <- mean(bank$y == "yes")
list(positive_rate = target_rate, counts = table(bank$y))
## $positive_rate
## [1] 0.1126542
##
## $counts
##
## no yes
## 36548 4640
This is an example how the data is prepared for modeling. the duration column is removed to avoid leakage, the target variable y is set as yes/no, and all the other features are converted into numbers using.
set.seed(42)
bank_model <- bank %>% select(-duration)
bank_model$y <- factor(bank_model$y, levels = c("no", "yes"))
x <- model.matrix(y ~ . , data = bank_model)[,-1]
y <- bank_model$y
dim(x)
## [1] 41188 366
table(y)
## y
## no yes
## 36548 4640