DATA 622 Assignment 1

Intent

This document contains all code and figures for Assignment 1, to accompany my essay.

Libraries and Data

I began by loading set of libraries and the raw dataset:

library(tidyverse)
library(ggplot2)
library(purrr)
library(knitr)
library(scales)
library(reshape2)
library(GGally)

df <- read_delim("https://raw.githubusercontent.com/AmandaSFox/DATA622/refs/heads/main/Assignment_1/bank-full.csv",
                 delim = ";",
                 quote = "\"")

Overview of Data

Glimpse provides a quick and easy initial look at the data. Missing data and duplicate records were verified, and column names changed to be more intuitive based on the documentation:

glimpse(df)

## Rows: 45,211
## Columns: 17
## $ age       <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance   <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day       <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration  <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays     <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

# missing data
colSums(is.na(df))

##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0

# duplicate records
nrow(df) - nrow(distinct(df))

## [1] 0

# change col names based on documentation
df_final <- df %>% 
  set_names("Age","Occupation","Marital_Status","Education",
            "In_Default","Avg_Balance","Housing_Loan","Personal_Loan",
            "Contact_Type","Day","Month","Duration","Contacts_This_Campaign",
            "Days_Since_Last_Campaign","Contacts_Before_This_Campaign",
            "Previous_Outcome","Y")

Categorical Variables

Simple frequency distributions were done for all categorical features to check for class imbalance and to get to know our data better, keeping our business perspective in mind.

# categorical column list
cat_cols <- df_final %>% 
  select(where(is.character)) %>%  
  names()

for (col in cat_cols) {
   df_final %>%
    count(.data[[col]]) %>%
    mutate(Pct = round((n / sum(n)) * 100, 1)) %>%
    arrange(desc(Pct)) %>% 
    print()
}

## # A tibble: 12 × 3
##    Occupation        n   Pct
##    <chr>         <int> <dbl>
##  1 blue-collar    9732  21.5
##  2 management     9458  20.9
##  3 technician     7597  16.8
##  4 admin.         5171  11.4
##  5 services       4154   9.2
##  6 retired        2264   5  
##  7 self-employed  1579   3.5
##  8 entrepreneur   1487   3.3
##  9 unemployed     1303   2.9
## 10 housemaid      1240   2.7
## 11 student         938   2.1
## 12 unknown         288   0.6
## # A tibble: 3 × 3
##   Marital_Status     n   Pct
##   <chr>          <int> <dbl>
## 1 married        27214  60.2
## 2 single         12790  28.3
## 3 divorced        5207  11.5
## # A tibble: 4 × 3
##   Education     n   Pct
##   <chr>     <int> <dbl>
## 1 secondary 23202  51.3
## 2 tertiary  13301  29.4
## 3 primary    6851  15.2
## 4 unknown    1857   4.1
## # A tibble: 2 × 3
##   In_Default     n   Pct
##   <chr>      <int> <dbl>
## 1 no         44396  98.2
## 2 yes          815   1.8
## # A tibble: 2 × 3
##   Housing_Loan     n   Pct
##   <chr>        <int> <dbl>
## 1 yes          25130  55.6
## 2 no           20081  44.4
## # A tibble: 2 × 3
##   Personal_Loan     n   Pct
##   <chr>         <int> <dbl>
## 1 no            37967    84
## 2 yes            7244    16
## # A tibble: 3 × 3
##   Contact_Type     n   Pct
##   <chr>        <int> <dbl>
## 1 cellular     29285  64.8
## 2 unknown      13020  28.8
## 3 telephone     2906   6.4
## # A tibble: 12 × 3
##    Month     n   Pct
##    <chr> <int> <dbl>
##  1 may   13766  30.4
##  2 jul    6895  15.3
##  3 aug    6247  13.8
##  4 jun    5341  11.8
##  5 nov    3970   8.8
##  6 apr    2932   6.5
##  7 feb    2649   5.9
##  8 jan    1403   3.1
##  9 oct     738   1.6
## 10 sep     579   1.3
## 11 mar     477   1.1
## 12 dec     214   0.5
## # A tibble: 4 × 3
##   Previous_Outcome     n   Pct
##   <chr>            <int> <dbl>
## 1 unknown          36959  81.7
## 2 failure           4901  10.8
## 3 other             1840   4.1
## 4 success           1511   3.3
## # A tibble: 2 × 3
##   Y         n   Pct
##   <chr> <int> <dbl>
## 1 no    39922  88.3
## 2 yes    5289  11.7

# Selected Plots for Essay
plot_label <- df_final %>% 
  ggplot(aes(x = Y, fill = Y)) +
  geom_bar(fill="darkseagreen") +
  labs(title = "") +
  scale_y_continuous(labels = scales::comma, breaks = seq(0, 50000, by = 5000)) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 14),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_text(size = 14))

print(plot_label)

plot_occupation <- df_final %>% 
  ggplot(aes(x = Occupation, fill = Y)) +
  geom_bar(fill="gold") +
  scale_y_continuous(labels = scales::comma, breaks = seq(0, 50000, by = 1000)) +
  labs(title = "",) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_text(size = 14))

print(plot_occupation)

Numeric Variables

Next, I evaluated the numeric variables for central tendency, spread, and outliers, while continuing to leverage business knowledge to ensure the data was reasonable.

# create df of numeric variables except Day (not meaningful for correlations)
df_num <- df_final %>% 
  select(-Day) %>% 
  select(where(is.numeric))

#---------------------------------
# Age
summary(df_num$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   39.00   40.94   48.00   95.00

plot_age <- df_num %>% 
  ggplot(aes(Age))+
  geom_histogram(bins = 40,fill="deepskyblue4") +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  scale_y_continuous(labels = scales::comma, breaks = seq(0, 50000, by = 500)) +
  labs(title = "Contact Age")+
  theme_minimal() +
  theme(legend.position = "none",
      title = element_text(size =16),
      axis.text.x = element_text(size = 14),
      axis.title.x = element_text(size = 16),
      axis.title.y = element_text(size = 16),
      axis.text.y = element_text(size = 14))

print(plot_age)

#---------------------------------
# Avg Balance
summary(df_num$Avg_Balance)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -8019      72     448    1362    1428  102127

df_num %>% 
  ggplot(aes(x = Avg_Balance)) +
  geom_histogram(bins = 1000, fill = "gray80") +
  geom_vline(xintercept = 0, color="black", linetype = "dashed") +
  coord_cartesian(c(-1000,12000)) +
  scale_x_continuous(labels = scales::comma, 
                     breaks = seq(-1000, 12000, by = 2000)) +
  scale_y_continuous(labels = scales::comma, 
                     breaks = seq(0, 50000, by = 1000)) +
  labs(title = "Avg Annual Balance",
      subtitle = "Zoomed-In: Actual Max = 104K") +
  theme_minimal() +
  theme(legend.position = "none",
        title = element_text(size = 16),
        axis.text.x = element_text(size = 14),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_text(size = 14))

#---------------------------------
# Call Duration
summary(df_num$Duration)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   103.0   180.0   258.2   319.0  4918.0

ggplot(df_num, aes(x = Duration)) +
  geom_histogram(bins = 60, fill = "skyblue") +
  theme_minimal() +
  coord_cartesian(c(-100,2000)) +
  scale_x_continuous(breaks = seq(0, 2000, by = 200)) +
  labs(title = "Call Duration in Seconds: Zoomed")

#---------------------------------
# Contacts Before This Campaign
summary(df_num$Contacts_Before_This_Campaign)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0000   0.5803   0.0000 275.0000

df_final %>% 
  count(Contacts_Before_This_Campaign) %>% 
  #  mutate(Pct = round((n / sum(n)) * 100, 1))
  mutate(
    Contacts = ifelse(Contacts_Before_This_Campaign <= 5, 
                      as.character(Contacts_Before_This_Campaign), 
                      ">5"),
    Pct = round((n / sum(n)) * 100, 1)
  ) %>%
  group_by(Contacts) %>%
  summarise(n = sum(n), Pct = sum(Pct)) %>%
  arrange(as.numeric(Contacts)) %>% 
  print(n = Inf)  # Show all rows (helpful in case there's 0-9 and "Other")

## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Contacts)`.
## Caused by warning:
## ! NAs introduced by coercion

## # A tibble: 7 × 3
##   Contacts     n   Pct
##   <chr>    <int> <dbl>
## 1 0        36954  81.7
## 2 1         2772   6.1
## 3 2         2106   4.7
## 4 3         1142   2.5
## 5 4          714   1.6
## 6 5          459   1  
## 7 >5        1064   2

#---------------------------------
# Contacts This Campaign
summary(df_num$Contacts_This_Campaign)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.764   3.000  63.000

df_final %>% 
  count(Contacts_This_Campaign) %>% 
  #  mutate(Pct = round((n / sum(n)) * 100, 1))
  mutate(
    Contacts = ifelse(Contacts_This_Campaign <= 9, 
                      as.character(Contacts_This_Campaign), 
                      ">9"),
    Pct = round((n / sum(n)) * 100, 1)
  ) %>%
  group_by(Contacts) %>%
  summarise(n = sum(n), Pct = sum(Pct)) %>%
  arrange(as.numeric(Contacts)) %>% 
  print(n = Inf)  # Show all rows (helpful in case there's 0-9 and "Other")

## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Contacts)`.
## Caused by warning:
## ! NAs introduced by coercion

## # A tibble: 10 × 3
##    Contacts     n   Pct
##    <chr>    <int> <dbl>
##  1 1        17544  38.8
##  2 2        12505  27.7
##  3 3         5521  12.2
##  4 4         3522   7.8
##  5 5         1764   3.9
##  6 6         1291   2.9
##  7 7          735   1.6
##  8 8          540   1.2
##  9 9          327   0.7
## 10 >9        1462   2.9

#---------------------------------
# Days Since Last Campaign
summary(df_num$Days_Since_Last_Campaign)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -1.0    -1.0    -1.0    40.2    -1.0   871.0

df_final %>% 
  count(Days_Since_Last_Campaign) %>% 
  #  mutate(Pct = round((n / sum(n)) * 100, 1))
  mutate(
    Days = ifelse(Days_Since_Last_Campaign <= 5, 
                  as.character(Days_Since_Last_Campaign), 
                  ">5"),
    Pct = round((n / sum(n)) * 100, 1)
  ) %>%
  group_by(Days) %>%
  summarise(n = sum(n), Pct = sum(Pct)) %>%
  arrange(as.numeric(Days)) %>% 
  print()

## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Days)`.
## Caused by warning:
## ! NAs introduced by coercion

## # A tibble: 7 × 3
##   Days      n   Pct
##   <chr> <int> <dbl>
## 1 -1    36954  81.7
## 2 1        15   0  
## 3 2        37   0.1
## 4 3         1   0  
## 5 4         2   0  
## 6 5        11   0  
## 7 >5     8191  14.4

# Remove -1 before charting
df_days_gt_0 <- df_num %>% 
  select(Days_Since_Last_Campaign) %>% 
  filter(Days_Since_Last_Campaign > 0)

ggplot(df_days_gt_0, aes(x = Days_Since_Last_Campaign)) +
  geom_histogram(bins = 60, fill = "skyblue") +
  theme_minimal() +
  coord_cartesian(c(-100,2000)) +
  scale_x_continuous(breaks = seq(0, 2000, by = 90)) +
  labs(title = "Days Since Last Campaign: >0 Days")

Relationships Between Variables

A correlation matrix was computed to identify relationships between numeric features. No significant correlations were found.

Bar charts and box plots were used to compare categorical/numeric and categorical/categorical pairs. Most pairs were not meaningful but several meaningful relationships were found, with some plots created for the essay.

# Numeric/Numeric
cor_matrix <- df_num %>% 
  cor(use = "pairwise.complete.obs")

# table of pairs with coefficients
cor_df <- as.data.frame(as.table(cor_matrix)) %>%
  filter(Var1 != Var2) %>%                    
  filter(abs(Freq) > 0.1) %>%
  arrange(desc(abs(Freq)))

cor_df

##                            Var1                          Var2      Freq
## 1 Contacts_Before_This_Campaign      Days_Since_Last_Campaign 0.4548196
## 2      Days_Since_Last_Campaign Contacts_Before_This_Campaign 0.4548196

# plot
melt_cor <- melt(cor_matrix)

ggplot(melt_cor, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1,1), space = "Lab",
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  coord_fixed()

#---------------------------------
# Categorical/Numeric Pairs 
# (LOOP CREATES CHARTS FOR ALL PAIRS - COMMENTED OUT, very long)
#---------------------------------

num_cols <- df_num %>% 
  names()

#for (cat in cat_cols) {
#  for (num in num_cols) {
#    p <- ggplot(df_final, 
#                aes(x = .data[[cat]], 
#                    y = .data[[num]])) +
#      geom_boxplot(outlier.color = "red", outlier.shape = 1) +
#      theme_minimal() +
#      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
#      labs(title = paste(num, "by", cat))
    
#    print(p)
#  }
#}

# SAMPLE FOR ESSAY
# call duration by Y
plot_duration_y <- df_final %>%  
  ggplot(aes(x = Y, y = Duration, fill = Y)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Call Duration by Response (Y)",
       x = "Response (Y)",
       y = "Call Duration (Seconds)") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(size = 16, face = "bold"),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14))

print(plot_duration_y)

#---------------------------------
# Categorical/Categorical Pairs
# (ALL PAIRS - COMMENTED OUT, very long!)
#---------------------------------

#if (length(cat_cols) > 1) {
#  for (i in 1:(length(cat_cols) - 1)) {
#    for (j in (i + 1):length(cat_cols)) {
#      cat1 <- cat_cols[i]
#      cat2 <- cat_cols[j]
#      
#      p <- ggplot(df_final, 
#                  aes(x = .data[[cat1]], 
#                      fill = .data[[cat2]])) +
#        geom_bar(position = "fill") +
#        scale_y_continuous(labels = scales::label_percent()) +
#        labs(title = paste("Proportion of", cat2, "within", cat1),
#             y = "Proportion") +
#        theme_minimal() +
#        theme(axis.text.x = element_text(angle = 45, hjust = 1))
#      
#      print(p)
#    }
#  }
#}

# SAMPLE FOR ESSAY
# occupation by education
plot_ed_occ <- df_final %>% 
  ggplot(aes(x = Occupation, fill = Education)) +
    geom_bar(position = "fill") +
    labs(title = "Occupation by Education",
         x = "Occupation",
         y = "") +
    scale_y_continuous(labels = scales::percent_format(), breaks = pretty_breaks(6)) +
    theme_minimal() +
    theme(legend.position = "right",
          legend.text = element_text(size = 14),
          legend.title = element_text(size = 14),
          title = element_text(size = 16),
          axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
          axis.title.x = element_text(size = 16),
          axis.title.y = element_text(size = 16),
          axis.text.y = element_text(size = 14))

print(plot_ed_occ)

Pre-Processing

This data was extremly clean, but based on the EDA above, I made two improvements:

Replaced -1 with NA in “Days Since Last Campaign” to more accurately identify first time contacts and avoid math issues
Created two new customer segmentation features to combine related features (occupation/education, borrowing profile) and simplify modeling

# Replace -1 with NA
df_preprocessed <- df_final %>%
  mutate(Days_Since_Last_Campaign = 
           ifelse(Days_Since_Last_Campaign == -1, 
                  NA, 
                  Days_Since_Last_Campaign))

# Results: Original and Revised
df_final %>% 
  count(Days_Since_Last_Campaign) %>% 
  mutate(
    Days = ifelse(Days_Since_Last_Campaign <= 5, 
                  as.character(Days_Since_Last_Campaign), 
                  ">5"),
    Pct = round((n / sum(n)) * 100, 1)
  ) %>%
  group_by(Days) %>%
  summarise(n = sum(n), Pct = sum(Pct)) %>%
  arrange(as.numeric(Days)) %>% 
  print()

## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Days)`.
## Caused by warning:
## ! NAs introduced by coercion

## # A tibble: 7 × 3
##   Days      n   Pct
##   <chr> <int> <dbl>
## 1 -1    36954  81.7
## 2 1        15   0  
## 3 2        37   0.1
## 4 3         1   0  
## 5 4         2   0  
## 6 5        11   0  
## 7 >5     8191  14.4

df_preprocessed %>% 
  count(Days_Since_Last_Campaign) %>% 
  mutate(
    Days = ifelse(Days_Since_Last_Campaign <= 5, 
                  as.character(Days_Since_Last_Campaign), 
                  ">5"),
    Pct = round((n / sum(n)) * 100, 1)
  ) %>%
  group_by(Days) %>%
  summarise(n = sum(n), Pct = sum(Pct)) %>%
  arrange(as.numeric(Days)) %>% 
  print()

## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Days)`.
## Caused by warning:
## ! NAs introduced by coercion

## # A tibble: 7 × 3
##   Days      n   Pct
##   <chr> <int> <dbl>
## 1 1        15   0  
## 2 2        37   0.1
## 3 3         1   0  
## 4 4         2   0  
## 5 5        11   0  
## 6 >5     8191  14.4
## 7 <NA>  36954  81.7

#---------------------------------
# occupation/education feature
df_preprocessed <- df_preprocessed %>%
  mutate(Occupation_Education = paste(Occupation, Education, sep = "_"))

df_preprocessed %>%
  count(Occupation_Education, sort = TRUE) %>%
  mutate(Pct = round((n / sum(n)) * 100, 1))

## # A tibble: 48 × 3
##    Occupation_Education       n   Pct
##    <chr>                  <int> <dbl>
##  1 management_tertiary     7801  17.3
##  2 blue-collar_secondary   5371  11.9
##  3 technician_secondary    5229  11.6
##  4 admin._secondary        4219   9.3
##  5 blue-collar_primary     3758   8.3
##  6 services_secondary      3457   7.6
##  7 technician_tertiary     1968   4.4
##  8 management_secondary    1121   2.5
##  9 retired_secondary        984   2.2
## 10 self-employed_tertiary   833   1.8
## # ℹ 38 more rows

#---------------------------------
# loan profile feature: default, housing, personal loans
df_preprocessed <- df_preprocessed %>%
  mutate(Loan_Profile = case_when(
    Housing_Loan == "yes" & Personal_Loan == "yes" & In_Default == "yes" ~ "Both Loans - In Default",
    Housing_Loan == "yes" & Personal_Loan == "yes" & In_Default == "no" ~ "Both Loans - No Default",
    Housing_Loan == "yes" & Personal_Loan == "no" & In_Default == "yes" ~ "Housing Loan - In Default",
    Housing_Loan == "yes" & Personal_Loan == "no" & In_Default == "no" ~ "Housing Loan - No Default",
    Housing_Loan == "no" & Personal_Loan == "yes" & In_Default == "yes" ~ "Personal Loan - In Default",
    Housing_Loan == "no" & Personal_Loan == "yes" & In_Default == "no" ~ "Personal Loan - No Default",
    Housing_Loan == "no" & Personal_Loan == "no" & In_Default == "yes" ~ "No Loans - In Default",
    TRUE ~ "No Loans - No Default"
  ))

df_preprocessed %>%
  count(Loan_Profile, sort = TRUE) %>%
  mutate(Pct = round((n / sum(n)) * 100, 1))

## # A tibble: 8 × 3
##   Loan_Profile                   n   Pct
##   <chr>                      <int> <dbl>
## 1 Housing Loan - No Default  20461  45.3
## 2 No Loans - No Default      16992  37.6
## 3 Both Loans - No Default     4234   9.4
## 4 Personal Loan - No Default  2709   6  
## 5 Housing Loan - In Default    302   0.7
## 6 No Loans - In Default        212   0.5
## 7 Personal Loan - In Default   168   0.4
## 8 Both Loans - In Default      133   0.3