This document contains all code and figures for Assignment 1, to accompany my essay.
I began by loading set of libraries and the raw dataset:
library(tidyverse)
library(ggplot2)
library(purrr)
library(knitr)
library(scales)
library(reshape2)
library(GGally)
df <- read_delim("https://raw.githubusercontent.com/AmandaSFox/DATA622/refs/heads/main/Assignment_1/bank-full.csv",
delim = ";",
quote = "\"")
Glimpse provides a quick and easy initial look at the data. Missing data and duplicate records were verified, and column names changed to be more intuitive based on the documentation:
glimpse(df)
## Rows: 45,211
## Columns: 17
## $ age <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
# missing data
colSums(is.na(df))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# duplicate records
nrow(df) - nrow(distinct(df))
## [1] 0
# change col names based on documentation
df_final <- df %>%
set_names("Age","Occupation","Marital_Status","Education",
"In_Default","Avg_Balance","Housing_Loan","Personal_Loan",
"Contact_Type","Day","Month","Duration","Contacts_This_Campaign",
"Days_Since_Last_Campaign","Contacts_Before_This_Campaign",
"Previous_Outcome","Y")
Simple frequency distributions were done for all categorical features to check for class imbalance and to get to know our data better, keeping our business perspective in mind.
# categorical column list
cat_cols <- df_final %>%
select(where(is.character)) %>%
names()
for (col in cat_cols) {
df_final %>%
count(.data[[col]]) %>%
mutate(Pct = round((n / sum(n)) * 100, 1)) %>%
arrange(desc(Pct)) %>%
print()
}
## # A tibble: 12 × 3
## Occupation n Pct
## <chr> <int> <dbl>
## 1 blue-collar 9732 21.5
## 2 management 9458 20.9
## 3 technician 7597 16.8
## 4 admin. 5171 11.4
## 5 services 4154 9.2
## 6 retired 2264 5
## 7 self-employed 1579 3.5
## 8 entrepreneur 1487 3.3
## 9 unemployed 1303 2.9
## 10 housemaid 1240 2.7
## 11 student 938 2.1
## 12 unknown 288 0.6
## # A tibble: 3 × 3
## Marital_Status n Pct
## <chr> <int> <dbl>
## 1 married 27214 60.2
## 2 single 12790 28.3
## 3 divorced 5207 11.5
## # A tibble: 4 × 3
## Education n Pct
## <chr> <int> <dbl>
## 1 secondary 23202 51.3
## 2 tertiary 13301 29.4
## 3 primary 6851 15.2
## 4 unknown 1857 4.1
## # A tibble: 2 × 3
## In_Default n Pct
## <chr> <int> <dbl>
## 1 no 44396 98.2
## 2 yes 815 1.8
## # A tibble: 2 × 3
## Housing_Loan n Pct
## <chr> <int> <dbl>
## 1 yes 25130 55.6
## 2 no 20081 44.4
## # A tibble: 2 × 3
## Personal_Loan n Pct
## <chr> <int> <dbl>
## 1 no 37967 84
## 2 yes 7244 16
## # A tibble: 3 × 3
## Contact_Type n Pct
## <chr> <int> <dbl>
## 1 cellular 29285 64.8
## 2 unknown 13020 28.8
## 3 telephone 2906 6.4
## # A tibble: 12 × 3
## Month n Pct
## <chr> <int> <dbl>
## 1 may 13766 30.4
## 2 jul 6895 15.3
## 3 aug 6247 13.8
## 4 jun 5341 11.8
## 5 nov 3970 8.8
## 6 apr 2932 6.5
## 7 feb 2649 5.9
## 8 jan 1403 3.1
## 9 oct 738 1.6
## 10 sep 579 1.3
## 11 mar 477 1.1
## 12 dec 214 0.5
## # A tibble: 4 × 3
## Previous_Outcome n Pct
## <chr> <int> <dbl>
## 1 unknown 36959 81.7
## 2 failure 4901 10.8
## 3 other 1840 4.1
## 4 success 1511 3.3
## # A tibble: 2 × 3
## Y n Pct
## <chr> <int> <dbl>
## 1 no 39922 88.3
## 2 yes 5289 11.7
# Selected Plots for Essay
plot_label <- df_final %>%
ggplot(aes(x = Y, fill = Y)) +
geom_bar(fill="darkseagreen") +
labs(title = "") +
scale_y_continuous(labels = scales::comma, breaks = seq(0, 50000, by = 5000)) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.y = element_text(size = 14))
print(plot_label)
plot_occupation <- df_final %>%
ggplot(aes(x = Occupation, fill = Y)) +
geom_bar(fill="gold") +
scale_y_continuous(labels = scales::comma, breaks = seq(0, 50000, by = 1000)) +
labs(title = "",) +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.y = element_text(size = 14))
print(plot_occupation)
Next, I evaluated the numeric variables for central tendency, spread, and outliers, while continuing to leverage business knowledge to ensure the data was reasonable.
# create df of numeric variables except Day (not meaningful for correlations)
df_num <- df_final %>%
select(-Day) %>%
select(where(is.numeric))
#---------------------------------
# Age
summary(df_num$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 33.00 39.00 40.94 48.00 95.00
plot_age <- df_num %>%
ggplot(aes(Age))+
geom_histogram(bins = 40,fill="deepskyblue4") +
scale_x_continuous(breaks = seq(0, 100, by = 5)) +
scale_y_continuous(labels = scales::comma, breaks = seq(0, 50000, by = 500)) +
labs(title = "Contact Age")+
theme_minimal() +
theme(legend.position = "none",
title = element_text(size =16),
axis.text.x = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.y = element_text(size = 14))
print(plot_age)
#---------------------------------
# Avg Balance
summary(df_num$Avg_Balance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8019 72 448 1362 1428 102127
df_num %>%
ggplot(aes(x = Avg_Balance)) +
geom_histogram(bins = 1000, fill = "gray80") +
geom_vline(xintercept = 0, color="black", linetype = "dashed") +
coord_cartesian(c(-1000,12000)) +
scale_x_continuous(labels = scales::comma,
breaks = seq(-1000, 12000, by = 2000)) +
scale_y_continuous(labels = scales::comma,
breaks = seq(0, 50000, by = 1000)) +
labs(title = "Avg Annual Balance",
subtitle = "Zoomed-In: Actual Max = 104K") +
theme_minimal() +
theme(legend.position = "none",
title = element_text(size = 16),
axis.text.x = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.y = element_text(size = 14))
#---------------------------------
# Call Duration
summary(df_num$Duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 103.0 180.0 258.2 319.0 4918.0
ggplot(df_num, aes(x = Duration)) +
geom_histogram(bins = 60, fill = "skyblue") +
theme_minimal() +
coord_cartesian(c(-100,2000)) +
scale_x_continuous(breaks = seq(0, 2000, by = 200)) +
labs(title = "Call Duration in Seconds: Zoomed")
#---------------------------------
# Contacts Before This Campaign
summary(df_num$Contacts_Before_This_Campaign)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5803 0.0000 275.0000
df_final %>%
count(Contacts_Before_This_Campaign) %>%
# mutate(Pct = round((n / sum(n)) * 100, 1))
mutate(
Contacts = ifelse(Contacts_Before_This_Campaign <= 5,
as.character(Contacts_Before_This_Campaign),
">5"),
Pct = round((n / sum(n)) * 100, 1)
) %>%
group_by(Contacts) %>%
summarise(n = sum(n), Pct = sum(Pct)) %>%
arrange(as.numeric(Contacts)) %>%
print(n = Inf) # Show all rows (helpful in case there's 0-9 and "Other")
## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Contacts)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 7 × 3
## Contacts n Pct
## <chr> <int> <dbl>
## 1 0 36954 81.7
## 2 1 2772 6.1
## 3 2 2106 4.7
## 4 3 1142 2.5
## 5 4 714 1.6
## 6 5 459 1
## 7 >5 1064 2
#---------------------------------
# Contacts This Campaign
summary(df_num$Contacts_This_Campaign)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.764 3.000 63.000
df_final %>%
count(Contacts_This_Campaign) %>%
# mutate(Pct = round((n / sum(n)) * 100, 1))
mutate(
Contacts = ifelse(Contacts_This_Campaign <= 9,
as.character(Contacts_This_Campaign),
">9"),
Pct = round((n / sum(n)) * 100, 1)
) %>%
group_by(Contacts) %>%
summarise(n = sum(n), Pct = sum(Pct)) %>%
arrange(as.numeric(Contacts)) %>%
print(n = Inf) # Show all rows (helpful in case there's 0-9 and "Other")
## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Contacts)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 10 × 3
## Contacts n Pct
## <chr> <int> <dbl>
## 1 1 17544 38.8
## 2 2 12505 27.7
## 3 3 5521 12.2
## 4 4 3522 7.8
## 5 5 1764 3.9
## 6 6 1291 2.9
## 7 7 735 1.6
## 8 8 540 1.2
## 9 9 327 0.7
## 10 >9 1462 2.9
#---------------------------------
# Days Since Last Campaign
summary(df_num$Days_Since_Last_Campaign)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0 -1.0 -1.0 40.2 -1.0 871.0
df_final %>%
count(Days_Since_Last_Campaign) %>%
# mutate(Pct = round((n / sum(n)) * 100, 1))
mutate(
Days = ifelse(Days_Since_Last_Campaign <= 5,
as.character(Days_Since_Last_Campaign),
">5"),
Pct = round((n / sum(n)) * 100, 1)
) %>%
group_by(Days) %>%
summarise(n = sum(n), Pct = sum(Pct)) %>%
arrange(as.numeric(Days)) %>%
print()
## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Days)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 7 × 3
## Days n Pct
## <chr> <int> <dbl>
## 1 -1 36954 81.7
## 2 1 15 0
## 3 2 37 0.1
## 4 3 1 0
## 5 4 2 0
## 6 5 11 0
## 7 >5 8191 14.4
# Remove -1 before charting
df_days_gt_0 <- df_num %>%
select(Days_Since_Last_Campaign) %>%
filter(Days_Since_Last_Campaign > 0)
ggplot(df_days_gt_0, aes(x = Days_Since_Last_Campaign)) +
geom_histogram(bins = 60, fill = "skyblue") +
theme_minimal() +
coord_cartesian(c(-100,2000)) +
scale_x_continuous(breaks = seq(0, 2000, by = 90)) +
labs(title = "Days Since Last Campaign: >0 Days")
A correlation matrix was computed to identify relationships between numeric features. No significant correlations were found.
Bar charts and box plots were used to compare categorical/numeric and categorical/categorical pairs. Most pairs were not meaningful but several meaningful relationships were found, with some plots created for the essay.
# Numeric/Numeric
cor_matrix <- df_num %>%
cor(use = "pairwise.complete.obs")
# table of pairs with coefficients
cor_df <- as.data.frame(as.table(cor_matrix)) %>%
filter(Var1 != Var2) %>%
filter(abs(Freq) > 0.1) %>%
arrange(desc(abs(Freq)))
cor_df
## Var1 Var2 Freq
## 1 Contacts_Before_This_Campaign Days_Since_Last_Campaign 0.4548196
## 2 Days_Since_Last_Campaign Contacts_Before_This_Campaign 0.4548196
# plot
melt_cor <- melt(cor_matrix)
ggplot(melt_cor, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
coord_fixed()
#---------------------------------
# Categorical/Numeric Pairs
# (LOOP CREATES CHARTS FOR ALL PAIRS - COMMENTED OUT, very long)
#---------------------------------
num_cols <- df_num %>%
names()
#for (cat in cat_cols) {
# for (num in num_cols) {
# p <- ggplot(df_final,
# aes(x = .data[[cat]],
# y = .data[[num]])) +
# geom_boxplot(outlier.color = "red", outlier.shape = 1) +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
# labs(title = paste(num, "by", cat))
# print(p)
# }
#}
# SAMPLE FOR ESSAY
# call duration by Y
plot_duration_y <- df_final %>%
ggplot(aes(x = Y, y = Duration, fill = Y)) +
geom_boxplot(outlier.color = "red", outlier.shape = 1) +
scale_y_continuous(labels = scales::comma) +
labs(title = "Call Duration by Response (Y)",
x = "Response (Y)",
y = "Call Duration (Seconds)") +
coord_flip() +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(size = 16, face = "bold"),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
print(plot_duration_y)
#---------------------------------
# Categorical/Categorical Pairs
# (ALL PAIRS - COMMENTED OUT, very long!)
#---------------------------------
#if (length(cat_cols) > 1) {
# for (i in 1:(length(cat_cols) - 1)) {
# for (j in (i + 1):length(cat_cols)) {
# cat1 <- cat_cols[i]
# cat2 <- cat_cols[j]
#
# p <- ggplot(df_final,
# aes(x = .data[[cat1]],
# fill = .data[[cat2]])) +
# geom_bar(position = "fill") +
# scale_y_continuous(labels = scales::label_percent()) +
# labs(title = paste("Proportion of", cat2, "within", cat1),
# y = "Proportion") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
#
# print(p)
# }
# }
#}
# SAMPLE FOR ESSAY
# occupation by education
plot_ed_occ <- df_final %>%
ggplot(aes(x = Occupation, fill = Education)) +
geom_bar(position = "fill") +
labs(title = "Occupation by Education",
x = "Occupation",
y = "") +
scale_y_continuous(labels = scales::percent_format(), breaks = pretty_breaks(6)) +
theme_minimal() +
theme(legend.position = "right",
legend.text = element_text(size = 14),
legend.title = element_text(size = 14),
title = element_text(size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.y = element_text(size = 14))
print(plot_ed_occ)
This data was extremly clean, but based on the EDA above, I made two improvements:
Replaced -1 with NA in “Days Since Last Campaign” to more accurately identify first time contacts and avoid math issues
Created two new customer segmentation features to combine related features (occupation/education, borrowing profile) and simplify modeling
# Replace -1 with NA
df_preprocessed <- df_final %>%
mutate(Days_Since_Last_Campaign =
ifelse(Days_Since_Last_Campaign == -1,
NA,
Days_Since_Last_Campaign))
# Results: Original and Revised
df_final %>%
count(Days_Since_Last_Campaign) %>%
mutate(
Days = ifelse(Days_Since_Last_Campaign <= 5,
as.character(Days_Since_Last_Campaign),
">5"),
Pct = round((n / sum(n)) * 100, 1)
) %>%
group_by(Days) %>%
summarise(n = sum(n), Pct = sum(Pct)) %>%
arrange(as.numeric(Days)) %>%
print()
## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Days)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 7 × 3
## Days n Pct
## <chr> <int> <dbl>
## 1 -1 36954 81.7
## 2 1 15 0
## 3 2 37 0.1
## 4 3 1 0
## 5 4 2 0
## 6 5 11 0
## 7 >5 8191 14.4
df_preprocessed %>%
count(Days_Since_Last_Campaign) %>%
mutate(
Days = ifelse(Days_Since_Last_Campaign <= 5,
as.character(Days_Since_Last_Campaign),
">5"),
Pct = round((n / sum(n)) * 100, 1)
) %>%
group_by(Days) %>%
summarise(n = sum(n), Pct = sum(Pct)) %>%
arrange(as.numeric(Days)) %>%
print()
## Warning: There was 1 warning in `arrange()`.
## ℹ In argument: `..1 = as.numeric(Days)`.
## Caused by warning:
## ! NAs introduced by coercion
## # A tibble: 7 × 3
## Days n Pct
## <chr> <int> <dbl>
## 1 1 15 0
## 2 2 37 0.1
## 3 3 1 0
## 4 4 2 0
## 5 5 11 0
## 6 >5 8191 14.4
## 7 <NA> 36954 81.7
#---------------------------------
# occupation/education feature
df_preprocessed <- df_preprocessed %>%
mutate(Occupation_Education = paste(Occupation, Education, sep = "_"))
df_preprocessed %>%
count(Occupation_Education, sort = TRUE) %>%
mutate(Pct = round((n / sum(n)) * 100, 1))
## # A tibble: 48 × 3
## Occupation_Education n Pct
## <chr> <int> <dbl>
## 1 management_tertiary 7801 17.3
## 2 blue-collar_secondary 5371 11.9
## 3 technician_secondary 5229 11.6
## 4 admin._secondary 4219 9.3
## 5 blue-collar_primary 3758 8.3
## 6 services_secondary 3457 7.6
## 7 technician_tertiary 1968 4.4
## 8 management_secondary 1121 2.5
## 9 retired_secondary 984 2.2
## 10 self-employed_tertiary 833 1.8
## # ℹ 38 more rows
#---------------------------------
# loan profile feature: default, housing, personal loans
df_preprocessed <- df_preprocessed %>%
mutate(Loan_Profile = case_when(
Housing_Loan == "yes" & Personal_Loan == "yes" & In_Default == "yes" ~ "Both Loans - In Default",
Housing_Loan == "yes" & Personal_Loan == "yes" & In_Default == "no" ~ "Both Loans - No Default",
Housing_Loan == "yes" & Personal_Loan == "no" & In_Default == "yes" ~ "Housing Loan - In Default",
Housing_Loan == "yes" & Personal_Loan == "no" & In_Default == "no" ~ "Housing Loan - No Default",
Housing_Loan == "no" & Personal_Loan == "yes" & In_Default == "yes" ~ "Personal Loan - In Default",
Housing_Loan == "no" & Personal_Loan == "yes" & In_Default == "no" ~ "Personal Loan - No Default",
Housing_Loan == "no" & Personal_Loan == "no" & In_Default == "yes" ~ "No Loans - In Default",
TRUE ~ "No Loans - No Default"
))
df_preprocessed %>%
count(Loan_Profile, sort = TRUE) %>%
mutate(Pct = round((n / sum(n)) * 100, 1))
## # A tibble: 8 × 3
## Loan_Profile n Pct
## <chr> <int> <dbl>
## 1 Housing Loan - No Default 20461 45.3
## 2 No Loans - No Default 16992 37.6
## 3 Both Loans - No Default 4234 9.4
## 4 Personal Loan - No Default 2709 6
## 5 Housing Loan - In Default 302 0.7
## 6 No Loans - In Default 212 0.5
## 7 Personal Loan - In Default 168 0.4
## 8 Both Loans - In Default 133 0.3