# ============================================================
# Bank Marketing EDA — inline-only (Posit Cloud friendly)
# Reads CSVs from two folders: "bank" and "bank-additional"
# No files are written; all outputs render in the knitted doc.
# ============================================================
# Packages
to_install <- c(
"tidyverse","janitor","skimr","corrplot","scales","glue","knitr"
)
new_pkgs <- setdiff(to_install, rownames(installed.packages()))
if (length(new_pkgs)) install.packages(new_pkgs, dependencies = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
library(corrplot)
## corrplot 0.95 loaded
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(glue)
library(knitr)
options(stringsAsFactors = FALSE)
theme_set(theme_minimal(base_size = 12))
knitr::opts_chunk$set(fig.width = 8, fig.height = 4.5, fig.align = "center")
win_bank <- "C:/Users/snaqw/Downloads/bank+marketing (1)/bank"
win_add <- "C:/Users/snaqw/Downloads/bank+marketing (1)/bank-additional"
rel_bank <- "./bank+marketing (1)/bank"
rel_add <- "./bank+marketing (1)/bank-additional"
bank_dir <- if (dir.exists(win_bank)) win_bank else rel_bank
bank_additional_dir <- if (dir.exists(win_add)) win_add else rel_add
for (d in c(bank_dir, bank_additional_dir)) {
if (!dir.exists(d)) stop(sprintf("Folder does not exist:\n %s\n\nTip: In Posit Cloud, upload the two folders into your project so these relative paths work.", d))
}
bank_files <- list.files(bank_dir, pattern = "\\.csv$", ignore.case = TRUE, full.names = TRUE)
bank_additional_files <- list.files(bank_additional_dir, pattern = "\\.csv$", ignore.case = TRUE, full.names = TRUE)
files <- c(bank_files, bank_additional_files)
if (!length(files)) stop("No CSV files found in either folder.")
cat("Found CSV files:\n")
## Found CSV files:
cat(paste0(" - ", basename(files), collapse = "\n"), "\n\n")
## - bank-full.csv
## - bank.csv
read_bank_csv_smart <- function(path) {
message("Reading: ", basename(path))
a <- tryCatch(read.csv(path, sep = ";", stringsAsFactors = FALSE, check.names = FALSE),
error = function(e) NULL)
b <- tryCatch(read.csv(path, sep = ",", stringsAsFactors = FALSE, check.names = FALSE),
error = function(e) NULL)
cand <- list(semicolon = a, comma = b)
ncols <- sapply(cand, function(x) if (is.null(x)) 0L else ncol(x))
pick <- ifelse(ncols["semicolon"] >= ncols["comma"], "semicolon", "comma")
df <- cand[[pick]]
if (is.null(df) || ncol(df) <= 1) stop("Failed to parse columns; check file integrity for: ", path)
df <- janitor::clean_names(df)
message(" chose '", pick, "' delimiter; ncol = ", ncol(df))
message(" first 6 cols: ", paste(head(names(df), 6), collapse = ", "))
df
}
make_numeric_if_looks_numeric <- function(df) {
df %>%
mutate(across(where(is.character), \(x) {
non_na <- x[!is.na(x)]
pct_numlike <- if (length(non_na)) mean(grepl("^[-+]?\\d*\\.?\\d+$", non_na)) else 0
if (pct_numlike > 0.9) suppressWarnings(as.numeric(x)) else x
}))
}
safe_cor <- function(df_num) {
ok <- sapply(df_num, \(v) is.numeric(v) && sd(v, na.rm = TRUE) > 0)
df_ok <- df_num[, ok, drop = FALSE]
if (ncol(df_ok) < 2) return(NULL)
suppressWarnings(cor(df_ok, use = "pairwise.complete.obs"))
}
# ---------- EDA for one file (inline outputs only) ----------
run_eda <- function(csv_path) {
base <- tools::file_path_sans_ext(basename(csv_path))
cat("\n\n### Dataset:", base, "\n\n")
df <- read_bank_csv_smart(csv_path) %>%
mutate(across(where(is.character), ~na_if(., "unknown"))) %>%
make_numeric_if_looks_numeric()
# Target identification (common names in UCI Bank Marketing)
target <- intersect(c("y","target","subscribed"), names(df))
target <- if (length(target)) target[[1]] else NA_character_
# Quick structure + dimensions
cat(glue("**Rows:** {nrow(df)} **Cols:** {ncol(df)}\n\n"))
cat("**First 6 columns:** ", paste(head(names(df), 6), collapse = ", "), "\n\n")
cat("**Inferred target:** ", ifelse(is.na(target), "_None_", paste0("`", target, "`")), "\n\n")
# Skim summary (nice compact overview)
skim_out <- capture.output(print(skim(df)))
cat("<details><summary><b>Skim summary (click to expand)</b></summary>\n\n")
cat("```\n")
cat(paste(skim_out, collapse = "\n"))
cat("\n```\n</details>\n\n")
# Missing values table (top 20)
miss_tbl <- sort(sapply(df, \(x) sum(is.na(x))), decreasing = TRUE)
miss_df <- tibble(feature = names(miss_tbl), missing = as.integer(miss_tbl)) %>%
slice_head(n = 20)
if (nrow(miss_df)) {
cat("**Missing values (top 20):**\n\n")
knitr::kable(miss_df, align = "lr")
cat("\n\n")
}
# Numeric vs categorical split
num_cols <- names(df)[sapply(df, is.numeric)]
cat_cols <- setdiff(names(df), num_cols)
# Central tendency & spread for numeric columns
if (length(num_cols)) {
cat("**Numeric summary (mean/median/sd/min/Q1/Q3/max):**\n\n")
ct <- df %>% summarise(across(all_of(num_cols),
list(mean = ~mean(., na.rm=TRUE),
median = ~median(., na.rm=TRUE),
sd = ~sd(., na.rm=TRUE),
min = ~min(., na.rm=TRUE),
q1 = ~quantile(., .25, na.rm=TRUE),
q3 = ~quantile(., .75, na.rm=TRUE),
max = ~max(., na.rm=TRUE)),
.names = "{.col}.{.fn}"))
ct_long <- tidyr::pivot_longer(ct, everything(),
names_to = c("feature","stat"),
names_sep="\\.", values_to = "value")
knitr::kable(ct_long, digits = 3)
cat("\n\n")
}
# Distributions (numeric) — show up to 12 to keep doc readable
if (length(num_cols)) {
max_show <- min(length(num_cols), 12)
cat(glue("**Distributions for numeric features (showing {max_show} of {length(num_cols)}):**\n\n"))
for (c in head(num_cols, max_show)) {
p <- ggplot(df, aes(x = .data[[c]])) +
geom_histogram(bins = 40, linewidth = 0.2, fill = "#c2d9ff", color = "grey25", na.rm = TRUE) +
labs(title = paste("Distribution:", c), x = c, y = "Count") +
theme(plot.title = element_text(face = "bold"))
print(p)
}
cat("\n")
}
# Categorical bars (top 25) — show up to 12 to keep doc readable
if (length(cat_cols)) {
max_show <- min(length(cat_cols), 12)
cat(glue("**Top-25 levels for categorical features (showing {max_show} of {length(cat_cols)}):**\n\n"))
for (c in head(cat_cols, max_show)) {
tmp <- df %>%
mutate(val = as.character(.data[[c]])) %>%
count(val, name = "n") %>%
arrange(desc(n)) %>% slice_head(n = 25)
p <- ggplot(tmp, aes(x = reorder(val, n), y = n)) +
geom_col(fill = "#b7e4c7", color = "grey25") + coord_flip() +
labs(title = paste("Counts (top 25):", c), x = c, y = "Count") +
theme(plot.title = element_text(face = "bold"))
print(p)
}
cat("\n")
}
# Correlation heatmap (numeric)
if (length(num_cols) >= 2) {
cmat <- safe_cor(df %>% select(all_of(num_cols)))
if (!is.null(cmat)) {
cat("**Correlation heatmap (numeric features):**\n\n")
corrplot(cmat, method = "color",
col = colorRampPalette(c("#313695","#4575b4","#74add1","#abd9e9",
"#e0f3f8","#ffffbf","#fee090","#fdae61",
"#f46d43","#d73027","#a50026"))(200),
tl.cex = 0.7, addgrid.col = "grey90", mar = c(0,0,2,0))
cat("\n\n")
} else {
cat("_Skipping correlation: insufficient numeric variance._\n\n")
}
}
# Outliers (IQR) — table only to avoid too many plots
if (length(num_cols)) {
iqr_info <- list()
for (c in num_cols) {
s <- df[[c]]; s <- s[!is.na(s)]
if (length(s)) {
q1 <- quantile(s, 0.25, na.rm=TRUE); q3 <- quantile(s, 0.75, na.rm=TRUE)
iqr <- q3 - q1; lower <- q1 - 1.5*iqr; upper <- q3 + 1.5*iqr
out_pct <- mean(s < lower | s > upper) * 100
iqr_info[[length(iqr_info)+1]] <- data.frame(
feature=c, lower=as.numeric(lower), upper=as.numeric(upper),
iqr=as.numeric(iqr), outlier_pct=as.numeric(out_pct)
)
}
}
if (length(iqr_info)) {
iqr_df <- dplyr::bind_rows(iqr_info) %>% dplyr::arrange(dplyr::desc(outlier_pct))
cat("**Outlier percentage (IQR rule):**\n\n")
knitr::kable(iqr_df, digits = 3)
cat("\n\n")
}
}
# Target distribution (if present)
if (!is.na(target) && target %in% names(df)) {
cat(glue("**Target distribution: `{target}`**\n\n"))
p <- df %>% count(.data[[target]]) %>%
ggplot(aes(x = .data[[target]], y = n)) +
geom_col(fill = "#ffd166", color = "grey25") +
labs(title = paste("Target Distribution:", target), x = target, y = "Count") +
theme(plot.title = element_text(face = "bold"))
print(p)
cat("\n")
}
invisible(TRUE)
}
# ---------- Run for all found files (inline) ----------
invisible(lapply(files, run_eda))
##
##
## ### Dataset: bank-full
## Reading: bank-full.csv
## chose 'comma' delimiter; ncol = 17
## first 6 cols: age, job, marital, education, default, balance
## **Rows:** 45211 **Cols:** 17
## **First 6 columns:** age, job, marital, education, default, balance
##
## **Inferred target:** `y`
##
## <details><summary><b>Skim summary (click to expand)</b></summary>
##
## ```
## ── Data Summary ────────────────────────
## Values
## Name df
## Number of rows 45211
## Number of columns 17
## _______________________
## Column type frequency:
## character 10
## numeric 7
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 job 288 0.994 6 13 0 11 0
## 2 marital 0 1 6 8 0 3 0
## 3 education 1857 0.959 7 9 0 3 0
## 4 default 0 1 2 3 0 2 0
## 5 housing 0 1 2 3 0 2 0
## 6 loan 0 1 2 3 0 2 0
## 7 contact 13020 0.712 8 9 0 2 0
## 8 month 0 1 3 3 0 12 0
## 9 poutcome 36959 0.183 5 7 0 3 0
## 10 y 0 1 2 3 0 2 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75
## 1 age 0 1 40.9 10.6 18 33 39 48
## 2 balance 0 1 1362. 3045. -8019 72 448 1428
## 3 day 0 1 15.8 8.32 1 8 16 21
## 4 duration 0 1 258. 258. 0 103 180 319
## 5 campaign 0 1 2.76 3.10 1 1 2 3
## 6 pdays 0 1 40.2 100. -1 -1 -1 -1
## 7 previous 0 1 0.580 2.30 0 0 0 0
## p100 hist
## 1 95 ▅▇▃▁▁
## 2 102127 ▇▁▁▁▁
## 3 31 ▇▆▇▆▆
## 4 4918 ▇▁▁▁▁
## 5 63 ▇▁▁▁▁
## 6 871 ▇▁▁▁▁
## 7 275 ▇▁▁▁▁
## ```
## </details>
##
## **Missing values (top 20):**
##
##
##
## **Numeric summary (mean/median/sd/min/Q1/Q3/max):**
##
##
##
## **Distributions for numeric features (showing 7 of 7):**







##
## **Top-25 levels for categorical features (showing 10 of 10):**










##
## **Correlation heatmap (numeric features):**

##
##
## **Outlier percentage (IQR rule):**
##
##
##
## **Target distribution: `y`**

##
##
##
## ### Dataset: bank
## Reading: bank.csv
## chose 'comma' delimiter; ncol = 17
## first 6 cols: age, job, marital, education, default, balance
## **Rows:** 4521 **Cols:** 17
## **First 6 columns:** age, job, marital, education, default, balance
##
## **Inferred target:** `y`
##
## <details><summary><b>Skim summary (click to expand)</b></summary>
##
## ```
## ── Data Summary ────────────────────────
## Values
## Name df
## Number of rows 4521
## Number of columns 17
## _______________________
## Column type frequency:
## character 10
## numeric 7
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 job 38 0.992 6 13 0 11 0
## 2 marital 0 1 6 8 0 3 0
## 3 education 187 0.959 7 9 0 3 0
## 4 default 0 1 2 3 0 2 0
## 5 housing 0 1 2 3 0 2 0
## 6 loan 0 1 2 3 0 2 0
## 7 contact 1324 0.707 8 9 0 2 0
## 8 month 0 1 3 3 0 12 0
## 9 poutcome 3705 0.180 5 7 0 3 0
## 10 y 0 1 2 3 0 2 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75
## 1 age 0 1 41.2 10.6 19 33 39 49
## 2 balance 0 1 1423. 3010. -3313 69 444 1480
## 3 day 0 1 15.9 8.25 1 9 16 21
## 4 duration 0 1 264. 260. 4 104 185 329
## 5 campaign 0 1 2.79 3.11 1 1 2 3
## 6 pdays 0 1 39.8 100. -1 -1 -1 -1
## 7 previous 0 1 0.543 1.69 0 0 0 0
## p100 hist
## 1 87 ▅▇▅▁▁
## 2 71188 ▇▁▁▁▁
## 3 31 ▆▆▇▅▆
## 4 3025 ▇▁▁▁▁
## 5 50 ▇▁▁▁▁
## 6 871 ▇▁▁▁▁
## 7 25 ▇▁▁▁▁
## ```
## </details>
##
## **Missing values (top 20):**
##
##
##
## **Numeric summary (mean/median/sd/min/Q1/Q3/max):**
##
##
##
## **Distributions for numeric features (showing 7 of 7):**







##
## **Top-25 levels for categorical features (showing 10 of 10):**










##
## **Correlation heatmap (numeric features):**

##
##
## **Outlier percentage (IQR rule):**
##
##
##
## **Target distribution: `y`**

cat("\n\n---\nAll analyses complete. Plots and tables are embedded above.\n")
##
##
## ---
## All analyses complete. Plots and tables are embedded above.