Introduction

This RMD supports Assignment 01 for CUNY SPS DATA 622, where we explore a dataset from a Portuguese bank’s marketing campaign. The bank used phone calls to predict whether clients would subscribe to a term deposit. The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y) by applying machine learning techniques to analyze the data and uncover the most effective strategies for boosting customer subscriptions in future campaigns, but this code stops at the EDA.

Load Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(corrplot)
## corrplot 0.95 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Get Data

# Read csv from github
bank_df <- read.csv("https://raw.githubusercontent.com/evanskaylie/DATA622/refs/heads/main/bank-full.csv", sep = ";")

# Check the data
head(bank_df)
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no

Exploratory Data Analysis

Are the features (columns) of your data correlated?

# Convert categorical variables to numeric for correlation
bank_df_num <- bank_df |>
  select(where(is.numeric))

# Calculate correlation matrix
corr_matrix <- cor(bank_df_num)

# Plot the correlation matrix using corrplot
corrplot(corr_matrix, method = "circle", type = "upper", tl.col = "salmon", tl.srt = 45)

It looks like the highest correlated features, in order, are previous~pdays [moderate positive], campaign~day [weak positive], pdays~day [weak negative], campaign~duration [weak negative], age~balance [weak positive], and previous~day [very weak negative].

What is the overall distribution of each variable?

# Get numeric variables only
numeric_vars <- bank_df |>
  select_if(is.numeric)
summary(numeric_vars)
##       age           balance            day           duration     
##  Min.   :18.00   Min.   : -8019   Min.   : 1.00   Min.   :   0.0  
##  1st Qu.:33.00   1st Qu.:    72   1st Qu.: 8.00   1st Qu.: 103.0  
##  Median :39.00   Median :   448   Median :16.00   Median : 180.0  
##  Mean   :40.94   Mean   :  1362   Mean   :15.81   Mean   : 258.2  
##  3rd Qu.:48.00   3rd Qu.:  1428   3rd Qu.:21.00   3rd Qu.: 319.0  
##  Max.   :95.00   Max.   :102127   Max.   :31.00   Max.   :4918.0  
##     campaign          pdays          previous       
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000  
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000  
##  Median : 2.000   Median : -1.0   Median :  0.0000  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803  
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000  
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000
# Plot distributions to visualize
numeric_vars |> 
  gather() |>
  ggplot(aes(value)) + 
  geom_histogram(bins = 20, fill = "lightblue", color = "salmon", size = 0.2) + 
  facet_wrap(~key, scales = "free_x") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Are there any outliers present?

# Define a function to identify outliers using IQR
find_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  # Identify outliers
  outliers <- which(x < lower_bound | x > upper_bound)
  
  return(outliers)
}

# Apply the function to each numeric variable and identify outliers
outliers_list <- numeric_vars |> 
  map(~find_outliers(.))

# Display outliers for each variable
# outliers_list # Result hidden for readability

Wowee that is a big (hidden) list. Let’s just get a count of how many outliers exist in each numerical variable.

# Count outliers
summary(outliers_list)
##          Length Class  Mode   
## age       487   -none- numeric
## balance  4729   -none- numeric
## day         0   -none- numeric
## duration 3235   -none- numeric
## campaign 3064   -none- numeric
## pdays    8257   -none- numeric
## previous 8257   -none- numeric

What are the relationships between different variables?

# Relationships between different variables
ggpairs(numeric_vars)

How about just the ones with notable correlation?

previous~pdays [moderate positive], campaign~day [weak positive], pdays~day [weak negative], campaign~duration [weak negative], age~balance [weak positive], and previous~day [very weak negative]

# Relationships between different variables
ggpairs(bank_df |>
          select(previous, pdays, campaign, day, duration)
        ) 

How are categorical variables distributed?

# Select categorical variables only
categorical_vars <- bank_df |>
  select_if(is.character)

# Compute counts for each category
categorical_counts <- categorical_vars |>
  gather() |>
  count(key, value)

# Plot with labels
ggplot(categorical_counts, aes(x = value, y = n)) + 
  geom_bar(stat = "identity", fill = "lightblue", color = "salmon", size = 0.2) + 
  geom_text(aes(label = n), hjust = -0.2, size = 2.5) +  # Smaller label size
  facet_wrap(~key, scales = "free_y") +
  coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(y = "Count", x = "Category", title = "Distribution of Categorical Variables")

Do any patterns or trends emerge in the data? 6 of 7 numerical features are left skewed

# See What is the overall distribution of each variable? above

What is the central tendency and spread of each variable?

# Central tendency and spread of numeric variables
summary(numeric_vars)
##       age           balance            day           duration     
##  Min.   :18.00   Min.   : -8019   Min.   : 1.00   Min.   :   0.0  
##  1st Qu.:33.00   1st Qu.:    72   1st Qu.: 8.00   1st Qu.: 103.0  
##  Median :39.00   Median :   448   Median :16.00   Median : 180.0  
##  Mean   :40.94   Mean   :  1362   Mean   :15.81   Mean   : 258.2  
##  3rd Qu.:48.00   3rd Qu.:  1428   3rd Qu.:21.00   3rd Qu.: 319.0  
##  Max.   :95.00   Max.   :102127   Max.   :31.00   Max.   :4918.0  
##     campaign          pdays          previous       
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000  
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000  
##  Median : 2.000   Median : -1.0   Median :  0.0000  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803  
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000  
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000

Are there any missing values and how significant are they?

# Check for missing values and their significance
missing_values <- colSums(is.na(bank_df))
cat("Missing values:\n", missing_values, "\n")
## Missing values:
##  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Yippee!