Assignment 1: Exploratory Data Analysis

# ============================================================
# Bank Marketing EDA — inline-only (Posit Cloud friendly)
# Reads CSVs from two folders: "bank" and "bank-additional"
# No files are written; all outputs render in the knitted doc.
# ============================================================

# Packages
to_install <- c(
  "tidyverse","janitor","skimr","corrplot","scales","glue","knitr"
)
new_pkgs <- setdiff(to_install, rownames(installed.packages()))
if (length(new_pkgs)) install.packages(new_pkgs, dependencies = TRUE)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)
library(corrplot)

## corrplot 0.95 loaded

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(glue)
library(knitr)

options(stringsAsFactors = FALSE)
theme_set(theme_minimal(base_size = 12))
knitr::opts_chunk$set(fig.width = 8, fig.height = 4.5, fig.align = "center")


win_bank <- "C:/Users/snaqw/Downloads/bank+marketing (1)/bank"
win_add  <- "C:/Users/snaqw/Downloads/bank+marketing (1)/bank-additional"

rel_bank <- "./bank+marketing (1)/bank"
rel_add  <- "./bank+marketing (1)/bank-additional"

bank_dir <- if (dir.exists(win_bank)) win_bank else rel_bank
bank_additional_dir <- if (dir.exists(win_add)) win_add else rel_add

for (d in c(bank_dir, bank_additional_dir)) {
  if (!dir.exists(d)) stop(sprintf("Folder does not exist:\n  %s\n\nTip: In Posit Cloud, upload the two folders into your project so these relative paths work.", d))
}
bank_files <- list.files(bank_dir, pattern = "\\.csv$", ignore.case = TRUE, full.names = TRUE)
bank_additional_files <- list.files(bank_additional_dir, pattern = "\\.csv$", ignore.case = TRUE, full.names = TRUE)

files <- c(bank_files, bank_additional_files)
if (!length(files)) stop("No CSV files found in either folder.")

cat("Found CSV files:\n")

## Found CSV files:

cat(paste0("  - ", basename(files), collapse = "\n"), "\n\n")

##   - bank-full.csv
##   - bank.csv

read_bank_csv_smart <- function(path) {
  message("Reading: ", basename(path))

  a <- tryCatch(read.csv(path, sep = ";", stringsAsFactors = FALSE, check.names = FALSE),
                error = function(e) NULL)
  b <- tryCatch(read.csv(path, sep = ",", stringsAsFactors = FALSE, check.names = FALSE),
                error = function(e) NULL)

  cand <- list(semicolon = a, comma = b)
  ncols <- sapply(cand, function(x) if (is.null(x)) 0L else ncol(x))
  pick  <- ifelse(ncols["semicolon"] >= ncols["comma"], "semicolon", "comma")
  df    <- cand[[pick]]

  if (is.null(df) || ncol(df) <= 1) stop("Failed to parse columns; check file integrity for: ", path)

  df <- janitor::clean_names(df)
  message("  chose '", pick, "' delimiter; ncol = ", ncol(df))
  message("  first 6 cols: ", paste(head(names(df), 6), collapse = ", "))
  df
}

make_numeric_if_looks_numeric <- function(df) {
  df %>%
    mutate(across(where(is.character), \(x) {
      non_na <- x[!is.na(x)]
      pct_numlike <- if (length(non_na)) mean(grepl("^[-+]?\\d*\\.?\\d+$", non_na)) else 0
      if (pct_numlike > 0.9) suppressWarnings(as.numeric(x)) else x
    }))
}

safe_cor <- function(df_num) {
  ok <- sapply(df_num, \(v) is.numeric(v) && sd(v, na.rm = TRUE) > 0)
  df_ok <- df_num[, ok, drop = FALSE]
  if (ncol(df_ok) < 2) return(NULL)
  suppressWarnings(cor(df_ok, use = "pairwise.complete.obs"))
}

# ---------- EDA for one file (inline outputs only) ----------
run_eda <- function(csv_path) {
  base <- tools::file_path_sans_ext(basename(csv_path))

  cat("\n\n### Dataset:", base, "\n\n")

  df <- read_bank_csv_smart(csv_path) %>%
        mutate(across(where(is.character), ~na_if(., "unknown"))) %>%
        make_numeric_if_looks_numeric()

  # Target identification (common names in UCI Bank Marketing)
  target <- intersect(c("y","target","subscribed"), names(df))
  target <- if (length(target)) target[[1]] else NA_character_

  # Quick structure + dimensions
  cat(glue("**Rows:** {nrow(df)} &nbsp;&nbsp; **Cols:** {ncol(df)}\n\n"))
  cat("**First 6 columns:** ", paste(head(names(df), 6), collapse = ", "), "\n\n")
  cat("**Inferred target:** ", ifelse(is.na(target), "_None_", paste0("`", target, "`")), "\n\n")

  # Skim summary (nice compact overview)
  skim_out <- capture.output(print(skim(df)))
  cat("<details><summary><b>Skim summary (click to expand)</b></summary>\n\n")
  cat("```\n")
  cat(paste(skim_out, collapse = "\n"))
  cat("\n```\n</details>\n\n")

  # Missing values table (top 20)
  miss_tbl <- sort(sapply(df, \(x) sum(is.na(x))), decreasing = TRUE)
  miss_df <- tibble(feature = names(miss_tbl), missing = as.integer(miss_tbl)) %>%
             slice_head(n = 20)
  if (nrow(miss_df)) {
    cat("**Missing values (top 20):**\n\n")
    knitr::kable(miss_df, align = "lr")
    cat("\n\n")
  }

  # Numeric vs categorical split
  num_cols <- names(df)[sapply(df, is.numeric)]
  cat_cols <- setdiff(names(df), num_cols)

  # Central tendency & spread for numeric columns
  if (length(num_cols)) {
    cat("**Numeric summary (mean/median/sd/min/Q1/Q3/max):**\n\n")
    ct <- df %>% summarise(across(all_of(num_cols),
                                  list(mean = ~mean(., na.rm=TRUE),
                                       median = ~median(., na.rm=TRUE),
                                       sd = ~sd(., na.rm=TRUE),
                                       min = ~min(., na.rm=TRUE),
                                       q1 = ~quantile(., .25, na.rm=TRUE),
                                       q3 = ~quantile(., .75, na.rm=TRUE),
                                       max = ~max(., na.rm=TRUE)),
                                  .names = "{.col}.{.fn}"))
    ct_long <- tidyr::pivot_longer(ct, everything(),
                                   names_to = c("feature","stat"),
                                   names_sep="\\.", values_to = "value")
    knitr::kable(ct_long, digits = 3)
    cat("\n\n")
  }

  # Distributions (numeric) — show up to 12 to keep doc readable
  if (length(num_cols)) {
    max_show <- min(length(num_cols), 12)
    cat(glue("**Distributions for numeric features (showing {max_show} of {length(num_cols)}):**\n\n"))
    for (c in head(num_cols, max_show)) {
      p <- ggplot(df, aes(x = .data[[c]])) +
        geom_histogram(bins = 40, linewidth = 0.2, fill = "#c2d9ff", color = "grey25", na.rm = TRUE) +
        labs(title = paste("Distribution:", c), x = c, y = "Count") +
        theme(plot.title = element_text(face = "bold"))
      print(p)
    }
    cat("\n")
  }

  # Categorical bars (top 25) — show up to 12 to keep doc readable
  if (length(cat_cols)) {
    max_show <- min(length(cat_cols), 12)
    cat(glue("**Top-25 levels for categorical features (showing {max_show} of {length(cat_cols)}):**\n\n"))
    for (c in head(cat_cols, max_show)) {
      tmp <- df %>%
        mutate(val = as.character(.data[[c]])) %>%
        count(val, name = "n") %>%
        arrange(desc(n)) %>% slice_head(n = 25)
      p <- ggplot(tmp, aes(x = reorder(val, n), y = n)) +
        geom_col(fill = "#b7e4c7", color = "grey25") + coord_flip() +
        labs(title = paste("Counts (top 25):", c), x = c, y = "Count") +
        theme(plot.title = element_text(face = "bold"))
      print(p)
    }
    cat("\n")
  }

  # Correlation heatmap (numeric)
  if (length(num_cols) >= 2) {
    cmat <- safe_cor(df %>% select(all_of(num_cols)))
    if (!is.null(cmat)) {
      cat("**Correlation heatmap (numeric features):**\n\n")
      corrplot(cmat, method = "color",
               col = colorRampPalette(c("#313695","#4575b4","#74add1","#abd9e9",
                                        "#e0f3f8","#ffffbf","#fee090","#fdae61",
                                        "#f46d43","#d73027","#a50026"))(200),
               tl.cex = 0.7, addgrid.col = "grey90", mar = c(0,0,2,0))
      cat("\n\n")
    } else {
      cat("_Skipping correlation: insufficient numeric variance._\n\n")
    }
  }

  # Outliers (IQR) — table only to avoid too many plots
  if (length(num_cols)) {
    iqr_info <- list()
    for (c in num_cols) {
      s <- df[[c]]; s <- s[!is.na(s)]
      if (length(s)) {
        q1 <- quantile(s, 0.25, na.rm=TRUE); q3 <- quantile(s, 0.75, na.rm=TRUE)
        iqr <- q3 - q1; lower <- q1 - 1.5*iqr; upper <- q3 + 1.5*iqr
        out_pct <- mean(s < lower | s > upper) * 100
        iqr_info[[length(iqr_info)+1]] <- data.frame(
          feature=c, lower=as.numeric(lower), upper=as.numeric(upper),
          iqr=as.numeric(iqr), outlier_pct=as.numeric(out_pct)
        )
      }
    }
    if (length(iqr_info)) {
      iqr_df <- dplyr::bind_rows(iqr_info) %>% dplyr::arrange(dplyr::desc(outlier_pct))
      cat("**Outlier percentage (IQR rule):**\n\n")
      knitr::kable(iqr_df, digits = 3)
      cat("\n\n")
    }
  }

  # Target distribution (if present)
  if (!is.na(target) && target %in% names(df)) {
    cat(glue("**Target distribution: `{target}`**\n\n"))
    p <- df %>% count(.data[[target]]) %>%
      ggplot(aes(x = .data[[target]], y = n)) +
      geom_col(fill = "#ffd166", color = "grey25") +
      labs(title = paste("Target Distribution:", target), x = target, y = "Count") +
      theme(plot.title = element_text(face = "bold"))
    print(p)
    cat("\n")
  }

  invisible(TRUE)
}

# ---------- Run for all found files (inline) ----------
invisible(lapply(files, run_eda))

## 
## 
## ### Dataset: bank-full

## Reading: bank-full.csv
##   chose 'comma' delimiter; ncol = 17
##   first 6 cols: age, job, marital, education, default, balance

## **Rows:** 45211 &nbsp;&nbsp; **Cols:** 17
## **First 6 columns:**  age, job, marital, education, default, balance 
## 
## **Inferred target:**  `y` 
## 
## <details><summary><b>Skim summary (click to expand)</b></summary>
## 
## ```
## ── Data Summary ────────────────────────
##                            Values
## Name                       df    
## Number of rows             45211 
## Number of columns          17    
## _______________________          
## Column type frequency:           
##   character                10    
##   numeric                  7     
## ________________________         
## Group variables            None  
## 
## ── Variable type: character ────────────────────────────────────────────────────
##    skim_variable n_missing complete_rate min max empty n_unique whitespace
##  1 job                 288         0.994   6  13     0       11          0
##  2 marital               0         1       6   8     0        3          0
##  3 education          1857         0.959   7   9     0        3          0
##  4 default               0         1       2   3     0        2          0
##  5 housing               0         1       2   3     0        2          0
##  6 loan                  0         1       2   3     0        2          0
##  7 contact           13020         0.712   8   9     0        2          0
##  8 month                 0         1       3   3     0       12          0
##  9 poutcome          36959         0.183   5   7     0        3          0
## 10 y                     0         1       2   3     0        2          0
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate     mean      sd    p0 p25 p50  p75
## 1 age                   0             1   40.9     10.6     18  33  39   48
## 2 balance               0             1 1362.    3045.   -8019  72 448 1428
## 3 day                   0             1   15.8      8.32     1   8  16   21
## 4 duration              0             1  258.     258.       0 103 180  319
## 5 campaign              0             1    2.76     3.10     1   1   2    3
## 6 pdays                 0             1   40.2    100.      -1  -1  -1   -1
## 7 previous              0             1    0.580    2.30     0   0   0    0
##     p100 hist 
## 1     95 ▅▇▃▁▁
## 2 102127 ▇▁▁▁▁
## 3     31 ▇▆▇▆▆
## 4   4918 ▇▁▁▁▁
## 5     63 ▇▁▁▁▁
## 6    871 ▇▁▁▁▁
## 7    275 ▇▁▁▁▁
## ```
## </details>
## 
## **Missing values (top 20):**
## 
## 
## 
## **Numeric summary (mean/median/sd/min/Q1/Q3/max):**
## 
## 
## 
## **Distributions for numeric features (showing 7 of 7):**

## 
## **Top-25 levels for categorical features (showing 10 of 10):**

## 
## **Correlation heatmap (numeric features):**

## 
## 
## **Outlier percentage (IQR rule):**
## 
## 
## 
## **Target distribution: `y`**

## 
## 
## 
## ### Dataset: bank

## Reading: bank.csv
##   chose 'comma' delimiter; ncol = 17
##   first 6 cols: age, job, marital, education, default, balance

## **Rows:** 4521 &nbsp;&nbsp; **Cols:** 17
## **First 6 columns:**  age, job, marital, education, default, balance 
## 
## **Inferred target:**  `y` 
## 
## <details><summary><b>Skim summary (click to expand)</b></summary>
## 
## ```
## ── Data Summary ────────────────────────
##                            Values
## Name                       df    
## Number of rows             4521  
## Number of columns          17    
## _______________________          
## Column type frequency:           
##   character                10    
##   numeric                  7     
## ________________________         
## Group variables            None  
## 
## ── Variable type: character ────────────────────────────────────────────────────
##    skim_variable n_missing complete_rate min max empty n_unique whitespace
##  1 job                  38         0.992   6  13     0       11          0
##  2 marital               0         1       6   8     0        3          0
##  3 education           187         0.959   7   9     0        3          0
##  4 default               0         1       2   3     0        2          0
##  5 housing               0         1       2   3     0        2          0
##  6 loan                  0         1       2   3     0        2          0
##  7 contact            1324         0.707   8   9     0        2          0
##  8 month                 0         1       3   3     0       12          0
##  9 poutcome           3705         0.180   5   7     0        3          0
## 10 y                     0         1       2   3     0        2          0
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable n_missing complete_rate     mean      sd    p0 p25 p50  p75
## 1 age                   0             1   41.2     10.6     19  33  39   49
## 2 balance               0             1 1423.    3010.   -3313  69 444 1480
## 3 day                   0             1   15.9      8.25     1   9  16   21
## 4 duration              0             1  264.     260.       4 104 185  329
## 5 campaign              0             1    2.79     3.11     1   1   2    3
## 6 pdays                 0             1   39.8    100.      -1  -1  -1   -1
## 7 previous              0             1    0.543    1.69     0   0   0    0
##    p100 hist 
## 1    87 ▅▇▅▁▁
## 2 71188 ▇▁▁▁▁
## 3    31 ▆▆▇▅▆
## 4  3025 ▇▁▁▁▁
## 5    50 ▇▁▁▁▁
## 6   871 ▇▁▁▁▁
## 7    25 ▇▁▁▁▁
## ```
## </details>
## 
## **Missing values (top 20):**
## 
## 
## 
## **Numeric summary (mean/median/sd/min/Q1/Q3/max):**
## 
## 
## 
## **Distributions for numeric features (showing 7 of 7):**

## 
## **Top-25 levels for categorical features (showing 10 of 10):**

## 
## **Correlation heatmap (numeric features):**

## 
## 
## **Outlier percentage (IQR rule):**
## 
## 
## 
## **Target distribution: `y`**

cat("\n\n---\nAll analyses complete. Plots and tables are embedded above.\n")

## 
## 
## ---
## All analyses complete. Plots and tables are embedded above.

Assignment 1: Exploratory Data Analysis

Said Naqwe

2025-10-08