library(tidyverse)
library(GGally)
library(ggpubr)
# Load the semicolon-delimited data file; leading/trailing whitespace is
# trimmed from every field, and doubled quotes are not treated as an escape.
bank <- read_delim(
  file = "bank-full.csv",
  delim = ";",
  trim_ws = TRUE,
  escape_double = FALSE
)
Exploratory Data Analysis: Review the structure and content of the data
# Print a compact, column-by-column overview of the data (type and first values).
bank |> glimpse()
## Rows: 45,211
## Columns: 17
## $ age <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
Then answer questions such as: Are the features (columns) of your data correlated? What are the relationships between different variables?
# Pairwise plot matrix restricted to the numeric columns.
bank |>
  select(where(is.numeric)) |>
  ggpairs()
What is the overall distribution of each variable? How are categorical variables distributed?
# Split the columns by type so each group can be drawn with a suitable geom
# and arranged in its own grid.
bank_vars1 <- bank |> select(where(is.numeric))
bank_vars2 <- bank |> select(where(is.character))

# One histogram per numeric column. `bins = 30` is ggplot2's default, stated
# explicitly so the "pick better value" message is not emitted for every
# plot; the axis label is set explicitly, matching the bar charts below.
distributions1 <- map(names(bank_vars1), \(var) {
  ggplot(bank_vars1, aes(x = .data[[var]])) +
    geom_histogram(bins = 30) +
    xlab(var)
})

# One bar chart per character column, bars ordered by descending frequency.
# fct_infreq() would otherwise leak into the axis title, hence xlab().
distributions2 <- map(names(bank_vars2), \(var) {
  ggplot(bank_vars2, aes(x = fct_infreq(.data[[var]]))) +
    geom_bar() +
    xlab(var) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
})

# Arrange each set of plots into a single grid.
ggarrange(plotlist = distributions1)
ggarrange(plotlist = distributions2)
Are there any outliers present?
# Find the outliers of a numeric vector using the 1.5 * IQR fences.
#
# x: numeric vector. NAs are ignored when computing the quartiles and are
#    never reported as outliers.
# Returns a one-element list, `outliers`, holding the values of `x` that lie
# below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR (possibly of length zero).
outlier_counts <- function(x) {
  q <- quantile(x, probs = c(.25, .75), na.rm = TRUE, names = FALSE)
  iqr <- q[2] - q[1]
  lower <- q[1] - 1.5 * iqr
  upper <- q[2] + 1.5 * iqr
  # which() drops NA comparisons; plain logical indexing would turn every NA
  # in x into an NA "outlier" and inflate the count.
  list(outliers = x[which(x < lower | x > upper)])
}
# Tabulate the outliers found in every numeric column, most affected first.
num_pred_names <- names(bank |> select(where(is.numeric)))
tibble(
  variable = num_pred_names,
  # map() always returns a list with one element per column; sapply() only
  # did so through its simplification rules flattening the wrapper lists.
  outliers = map(bank[num_pred_names], \(x) outlier_counts(x)$outliers),
  n_outliers = lengths(outliers)
) |>
  arrange(desc(n_outliers))
## # A tibble: 7 × 3
## variable outliers n_outliers
## <chr> <named list> <int>
## 1 pdays <dbl [8,257]> 8257
## 2 previous <dbl [8,257]> 8257
## 3 balance <dbl [4,729]> 4729
## 4 duration <dbl [3,235]> 3235
## 5 campaign <dbl [3,064]> 3064
## 6 age <dbl [487]> 487
## 7 day <dbl [0]> 0
# Horizontal boxplots of the numeric columns. The axis is flipped, so the
# factor levels are reversed to list the variables top to bottom in the
# order written below.
bank |>
  select(pdays, previous, balance, duration, campaign, age, day) |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) |>
  mutate(
    variable = fct_rev(factor(
      variable,
      levels = c(
        "pdays", "previous", "balance", "duration", "campaign", "age", "day"
      )
    ))
  ) |>
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Visualizing Outliers", x = "", y = "")
Are there any missing values and how significant are they?
# Count explicit missing values (NA): in total and per column.
na_overall <- sum(is.na(bank))
# vapply() guarantees a named integer vector whatever the shape of `bank`,
# whereas sapply()'s return type depends on its input.
na_by_col <- vapply(bank, \(x) sum(is.na(x)), integer(1))
na_overall
na_by_col
## [1] 0
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# Share of "unknown" entries in each character column, largest share first.
unknown_rates <- bank |>
  summarise(
    across(
      .cols = where(is.character),
      .fns = \(col) mean(col == "unknown", na.rm = TRUE)
    )
  ) |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "prop_unknown"
  ) |>
  arrange(desc(prop_unknown))
unknown_rates
## # A tibble: 10 × 2
## variable prop_unknown
## <chr> <dbl>
## 1 poutcome 0.817
## 2 contact 0.288
## 3 education 0.0411
## 4 job 0.00637
## 5 marital 0
## 6 default 0
## 7 housing 0
## 8 loan 0
## 9 month 0
## 10 y 0