# Import required libraries and data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.3
url <- 'https://raw.githubusercontent.com/Marley-Myrianthopoulos/grad_school_data/refs/heads/main/bank-full.csv'
bank_data <- read.csv(url, sep = ';')
print(head(bank_data))
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no
glimpse(bank_data)
## Rows: 45,211
## Columns: 17
## $ age <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
The data contains 45,211 observations of 17 variables. The variables include two different types of data:
age
, balance
,
day
, duration
, campaign
,
pdays
, and previous
variables.job
, marital
,
education
, default
, housing
,
loan
, contact
, month
,
poutcome
, and y
variables.glimpse(bank_data |> distinct())
## Rows: 45,211
## Columns: 17
## $ age <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
The distinct data frame has the same number of rows as the original data frame, so there are no duplicate rows.
There are no NAs in the data set.
print(sum(is.na(bank_data)))
## [1] 0
However, many of the values are entered as “unknown”. The distribution is shown below.
# Determine number of "unknown" values in each column
num_unknown <- c()
for (i in 1:ncol(bank_data)) {
num_unknown[i] = sum(bank_data[,i] == 'unknown')
}
unknown_df <- data.frame(
variable = colnames(bank_data),
num_unknown = num_unknown) |>
filter(num_unknown != 0) |>
arrange(num_unknown) |>
mutate(pct_unknown = round(num_unknown / nrow(bank_data) * 100, 2))
kable(unknown_df, caption = 'Unknown Values in Bank Data', format = 'pipe', col.names = c('Variable', 'Count', 'Percent'), align = 'lcc')
Variable | Count | Percent |
---|---|---|
job | 288 | 0.64 |
education | 1857 | 4.11 |
contact | 13020 | 28.80 |
poutcome | 36959 | 81.75 |
# Visualize the number of "unknown" values in each column
unknown_df |> ggplot(aes(x = reorder(variable, -num_unknown), y = num_unknown)) + geom_col()
The default
, housing
, loan
,
and y
columns are binary variables recorded as either “yes”
or “no”.
print(unique(bank_data$default))
## [1] "no" "yes"
print(unique(bank_data$housing))
## [1] "yes" "no"
print(unique(bank_data$loan))
## [1] "no" "yes"
print(unique(bank_data$y))
## [1] "no" "yes"
The distribution of these variables is shown below.
# Determine distribution of binary variables
binary_vars <- bank_data |>
select(c(default, housing, loan, y)) |>
pivot_longer(cols = c(default, housing, loan, y),
names_to = 'variable',
values_to = 'value') |>
group_by(variable, value) |>
mutate(count = n()) |>
ungroup() |>
distinct() |>
arrange(variable, value) |>
group_by(variable) |>
mutate(pct = round(count / sum(count) * 100, 2)) |>
ungroup()
kable(binary_vars, caption = 'Distribution of Binary Variables', format = 'pipe', col.names = c('Variable', 'Value', 'Count', 'Percent'), align = 'lccc')
Variable | Value | Count | Percent |
---|---|---|---|
default | no | 44396 | 98.20 |
default | yes | 815 | 1.80 |
housing | no | 20081 | 44.42 |
housing | yes | 25130 | 55.58 |
loan | no | 37967 | 83.98 |
loan | yes | 7244 | 16.02 |
y | no | 39922 | 88.30 |
y | yes | 5289 | 11.70 |
# Visualize the distribution of binary variables
binary_vars |> ggplot(aes(x = variable, y = count, fill = value)) + geom_col()
The job
, marital
, education
,
contact
, month
, and poutcome
variables are non-binary categorical variables. The distribution of each
variable is shown below.
# Visualize the distribution of categorical variables
bank_data |> ggplot(aes(x = job)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
bank_data |> ggplot(aes(x = marital)) + geom_bar()
bank_data |> ggplot(aes(x = education)) + geom_bar()
bank_data |> ggplot(aes(x = contact)) + geom_bar()
bank_data |> mutate(month = fct_relevel(month, 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec')) |> ggplot(aes(x = month)) + geom_bar()
bank_data |> ggplot(aes(x = poutcome)) + geom_bar()
None of these variables are evenly distributed.
# Visualize the distribution of numeric variables
numeric_bank_data <- bank_data |> select(age, balance, day, duration, campaign, pdays, previous)
numeric_bank_data |>
ggplot(aes(x = age)) +
geom_bar()
numeric_bank_data |>
ggplot(aes(x = balance)) +
geom_histogram(binwidth = 100)
numeric_bank_data |>
ggplot(aes(x = day)) +
geom_bar()
numeric_bank_data |>
ggplot(aes(x = duration)) +
geom_bar()
numeric_bank_data |>
ggplot(aes(x = campaign)) +
geom_bar()
numeric_bank_data |>
ggplot(aes(x = pdays)) +
geom_histogram(binwidth = 7)
numeric_bank_data |>
ggplot(aes(x = previous)) +
geom_bar()
# Create boxplots of numeric variables
numeric_bank_data |>
ggplot(aes(x = age)) +
geom_boxplot()
numeric_bank_data |>
ggplot(aes(x = balance)) +
geom_boxplot()
numeric_bank_data |>
ggplot(aes(x = day)) +
geom_boxplot()
numeric_bank_data |>
ggplot(aes(x = duration)) +
geom_boxplot()
numeric_bank_data |>
ggplot(aes(x = campaign)) +
geom_boxplot()
numeric_bank_data |>
ggplot(aes(x = pdays)) +
geom_boxplot()
numeric_bank_data |>
ggplot(aes(x = previous)) +
geom_boxplot()
There are significant outliers in the balance
,
duration
, and previous
variables. These are
explored below.
# Identify significant outliers in each variable
print(head(bank_data |> select(previous) |> arrange(desc(previous))))
## previous
## 1 275
## 2 58
## 3 55
## 4 51
## 5 41
## 6 40
print(head(bank_data |> select(duration) |> arrange(desc(duration))))
## duration
## 1 4918
## 2 3881
## 3 3785
## 4 3422
## 5 3366
## 6 3322
print(head(bank_data |> select(balance) |> arrange(desc(balance))))
## balance
## 1 102127
## 2 98417
## 3 81204
## 4 81204
## 5 71188
## 6 66721
# Check each combination of binary variables for correlation
bank_data$default = as.numeric(bank_data$default == "yes")
bank_data$housing = as.numeric(bank_data$housing == "yes")
bank_data$loan = as.numeric(bank_data$loan == "yes")
bank_data$y = as.numeric(bank_data$y == "yes")
binary_vars_1 <- c('default', 'default', 'default', 'housing', 'housing', 'loan')
binary_vars_2 <- c('housing', 'loan', 'y', 'loan', 'y', 'y')
binary_cors <- c(cor(bank_data$default, bank_data$housing, method = "pearson"), cor(bank_data$default, bank_data$loan, method = "pearson"), cor(bank_data$default, bank_data$y, method = "pearson"), cor(bank_data$housing, bank_data$loan, method = "pearson"), cor(bank_data$housing, bank_data$y, method = "pearson"), cor(bank_data$loan, bank_data$y, method = "pearson"))
binary_cor_df <- data.frame(var1 = binary_vars_1,
var2 = binary_vars_2,
phi = binary_cors) |>
arrange(phi, var1, var2)
kable(binary_cor_df, caption = 'Correlation of Binary Variables', format = 'pipe', col.names = c('1st Variable', '2nd Variable', 'Phi Coefficient'), align = 'llc')
1st Variable | 2nd Variable | Phi Coefficient |
---|---|---|
housing | y | -0.1391727 |
loan | y | -0.0681850 |
default | y | -0.0224190 |
default | housing | -0.0060252 |
housing | loan | 0.0413229 |
default | loan | 0.0772342 |
There are no noteworthy correlations between the binary variables.
There is a moderate correlation between pdays
and
previous
, but no other correlations stronger than .2
between the numeric variables.
# Create a correlation matrix for numeric variables
numeric_bank_data <- bank_data |>
select(age, balance, day, duration, campaign, pdays, previous)
print(cor(numeric_bank_data))
## age balance day duration campaign
## age 1.000000000 0.097782739 -0.009120046 -0.004648428 0.004760312
## balance 0.097782739 1.000000000 0.004502585 0.021560380 -0.014578279
## day -0.009120046 0.004502585 1.000000000 -0.030206341 0.162490216
## duration -0.004648428 0.021560380 -0.030206341 1.000000000 -0.084569503
## campaign 0.004760312 -0.014578279 0.162490216 -0.084569503 1.000000000
## pdays -0.023758014 0.003435322 -0.093044074 -0.001564770 -0.088627668
## previous 0.001288319 0.016673637 -0.051710497 0.001203057 -0.032855290
## pdays previous
## age -0.023758014 0.001288319
## balance 0.003435322 0.016673637
## day -0.093044074 -0.051710497
## duration -0.001564770 0.001203057
## campaign -0.088627668 -0.032855290
## pdays 1.000000000 0.454819635
## previous 0.454819635 1.000000000
However, the pdays
variable uses a -1 for clients who
were not previously contacted. Removing clients who had not been
previously contacted from the data set results in effectively 0
correlation between the pdays
and previous
variables, indicating that the apparent correlation was the result of
customers who had never been contacted always having -1 for pdays and 0
for previous. Besides this obvious overlap, there is no correlation
between the variables.
# Check correlation between pdays and previous for previously contacted customers only
previously_contacted_customers <- numeric_bank_data |>
filter(pdays != -1) |>
select(pdays, previous)
print(cor(previously_contacted_customers))
## pdays previous
## pdays 1.00000000 -0.02188487
## previous -0.02188487 1.00000000
Some jobs have noticeably higher average balances than others.
# Compare average balance within job categories
avg_balance_by_job <- bank_data |>
select(job, balance) |>
group_by(job) |>
mutate(avg_balance = mean(balance)) |>
ungroup() |>
select(job, avg_balance) |>
distinct() |>
arrange(desc(avg_balance))
avg_balance_by_job |> ggplot(aes(x = job, y = avg_balance)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Average balance increases as age increases.
# Compare average balance by age
avg_balance_by_age <- bank_data |>
select(age, balance) |>
group_by(age) |>
mutate(avg_balance = mean(balance)) |>
ungroup() |>
select(age, avg_balance) |>
distinct() |>
arrange(age)
avg_balance_by_age |> ggplot(aes(x = age, y = avg_balance)) + geom_line()
There is an odd spike in the graph. This is further explored below.
# Locate the spike in the data
print(
avg_balance_by_age <- avg_balance_by_age |>
arrange(desc(avg_balance))
)
## # A tibble: 77 × 2
## age avg_balance
## <int> <dbl>
## 1 84 19511.
## 2 85 4603.
## 3 75 4057.
## 4 74 4026.
## 5 71 3462.
## 6 70 3395.
## 7 68 3196.
## 8 61 2758.
## 9 77 2600.
## 10 72 2541.
## # ℹ 67 more rows
The spike occurs at age 84.
# Examine the balance of all observations with age 84
print(bank_data |> select(age, balance) |> filter(age == 84) |> arrange(desc(balance)))
## age balance
## 1 84 81204
## 2 84 81204
## 3 84 4761
## 4 84 2619
## 5 84 1791
## 6 84 1702
## 7 84 1680
## 8 84 639
## 9 84 0
There is a suspicious duplication of the largest balance. To see if this is the same person contacted, consider all variables for these two observations.
# Compare all variables
print(bank_data |> filter(age == 84) |> filter(balance == 81204) |> arrange(desc(balance)))
## age job marital education default balance housing loan contact day
## 1 84 retired married secondary 0 81204 0 0 telephone 28
## 2 84 retired married secondary 0 81204 0 0 telephone 1
## month duration campaign pdays previous poutcome y
## 1 dec 679 1 313 2 other 1
## 2 apr 390 1 94 3 success 1
The identifying information are identical. The differences suggest that this is the same respondent contacted twice during the campaign.
# Visualize the percent of respondents who are retired by age
retired_pct_by_age <- bank_data |>
select(age, job) |>
group_by(age) |>
mutate(num_retired = sum(job == 'retired')) |>
mutate(total = n()) |>
ungroup() |>
mutate(pct_retired = num_retired / total * 100) |>
select(age, pct_retired) |>
distinct()
retired_pct_by_age |> ggplot(aes(x = age, y = pct_retired)) + geom_line()
As expected, the percentage of respondents at each age who are retired is almst 0 until age 50, and then increases sharply.
# Visualize the percent of respondents who are students by age
student_pct_by_age <- bank_data |>
select(age, job) |>
group_by(age) |>
mutate(num_student = sum(job == 'student')) |>
mutate(total = n()) |>
ungroup() |>
mutate(pct_student = num_student / total * 100) |>
select(age, pct_student) |>
distinct()
student_pct_by_age |> ggplot(aes(x = age, y = pct_student)) + geom_line()
Similarly, the percentage of respondents are each age who are students starts high and then falls off to almost 0 by age 35.