Exploratory Data Analysis

Data Acquisition

# Import required libraries and data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.3
url <- 'https://raw.githubusercontent.com/Marley-Myrianthopoulos/grad_school_data/refs/heads/main/bank-full.csv'
bank_data <- read.csv(url, sep = ';')
print(head(bank_data))
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no

Data Summary

glimpse(bank_data)
## Rows: 45,211
## Columns: 17
## $ age       <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance   <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration  <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

The data contains 45,211 observations of 17 variables. The variables include two different types of data:

  • Integers are used for the age, balance, day, duration, campaign, pdays, and previous variables.
  • Characters are used for the job, marital, education, default, housing, loan, contact, month, poutcome, and y variables.
glimpse(bank_data |> distinct())
## Rows: 45,211
## Columns: 17
## $ age       <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance   <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration  <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

The distinct data frame has the same number of rows as the original data frame, so there are no duplicate rows.

Missing Values

There are no NAs in the data set.

print(sum(is.na(bank_data)))
## [1] 0

However, many of the values are entered as “unknown”. The distribution is shown below.

# Determine number of "unknown" values in each column

num_unknown <- c()
for (i in 1:ncol(bank_data)) {
  num_unknown[i] = sum(bank_data[,i] == 'unknown')
}

unknown_df <- data.frame(
  variable = colnames(bank_data),
  num_unknown = num_unknown) |>
  filter(num_unknown != 0) |>
  arrange(num_unknown) |>
  mutate(pct_unknown = round(num_unknown / nrow(bank_data) * 100, 2))

kable(unknown_df, caption = 'Unknown Values in Bank Data', format = 'pipe', col.names = c('Variable', 'Count', 'Percent'), align = 'lcc')
Unknown Values in Bank Data
Variable Count Percent
job 288 0.64
education 1857 4.11
contact 13020 28.80
poutcome 36959 81.75
# Visualize the number of "unknown" values in each column

unknown_df |> ggplot(aes(x = reorder(variable, -num_unknown), y = num_unknown)) + geom_col()

Distribution of Values

Binary Variables

The default, housing, loan, and y columns are binary variables recorded as either “yes” or “no”.

print(unique(bank_data$default))
## [1] "no"  "yes"
print(unique(bank_data$housing))
## [1] "yes" "no"
print(unique(bank_data$loan))
## [1] "no"  "yes"
print(unique(bank_data$y))
## [1] "no"  "yes"

The distribution of these variables is shown below.

# Determine distribution of binary variables

binary_vars <- bank_data |>
  select(c(default, housing, loan, y)) |>
  pivot_longer(cols = c(default, housing, loan, y),
               names_to = 'variable',
               values_to = 'value') |>
  group_by(variable, value) |>
  mutate(count = n()) |>
  ungroup() |>
  distinct() |>
  arrange(variable, value) |>
  group_by(variable) |>
  mutate(pct = round(count / sum(count) * 100, 2)) |>
  ungroup()

kable(binary_vars, caption = 'Distribution of Binary Variables', format = 'pipe', col.names = c('Variable', 'Value', 'Count', 'Percent'), align = 'lccc')
Distribution of Binary Variables
Variable Value Count Percent
default no 44396 98.20
default yes 815 1.80
housing no 20081 44.42
housing yes 25130 55.58
loan no 37967 83.98
loan yes 7244 16.02
y no 39922 88.30
y yes 5289 11.70
# Visualize the distribution of binary variables

binary_vars |> ggplot(aes(x = variable, y = count, fill = value)) + geom_col()

Non-Binary Categorical Variables

The job, marital, education, contact, month, and poutcome variables are non-binary categorical variables. The distribution of each variable is shown below.

# Visualize the distribution of categorical variables

bank_data |> ggplot(aes(x = job)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

bank_data |> ggplot(aes(x = marital)) + geom_bar()

bank_data |> ggplot(aes(x = education)) + geom_bar()

bank_data |> ggplot(aes(x = contact)) + geom_bar()

bank_data |> mutate(month = fct_relevel(month, 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec')) |> ggplot(aes(x = month)) + geom_bar()

bank_data |> ggplot(aes(x = poutcome)) + geom_bar()

None of these variables are evenly distributed.

Numeric Variables

# Visualize the distribution of numeric variables

numeric_bank_data <- bank_data |> select(age, balance, day, duration, campaign, pdays, previous)

numeric_bank_data |>
  ggplot(aes(x = age)) +
  geom_bar()

numeric_bank_data |>
  ggplot(aes(x = balance)) +
  geom_histogram(binwidth = 100)

numeric_bank_data |>
  ggplot(aes(x = day)) +
  geom_bar()

numeric_bank_data |>
  ggplot(aes(x = duration)) +
  geom_bar()

numeric_bank_data |>
  ggplot(aes(x = campaign)) +
  geom_bar()

numeric_bank_data |>
  ggplot(aes(x = pdays)) +
  geom_histogram(binwidth = 7)

numeric_bank_data |>
  ggplot(aes(x = previous)) +
  geom_bar()

Outliers

# Create boxplots of numeric variables

numeric_bank_data |>
  ggplot(aes(x = age)) +
  geom_boxplot()

numeric_bank_data |>
  ggplot(aes(x = balance)) +
  geom_boxplot()

numeric_bank_data |>
  ggplot(aes(x = day)) +
  geom_boxplot()

numeric_bank_data |>
  ggplot(aes(x = duration)) +
  geom_boxplot()

numeric_bank_data |>
  ggplot(aes(x = campaign)) +
  geom_boxplot()

numeric_bank_data |>
  ggplot(aes(x = pdays)) +
  geom_boxplot()

numeric_bank_data |>
  ggplot(aes(x = previous)) +
  geom_boxplot()

There are significant outliers in the balance, duration, and previous variables. These are explored below.

# Identify significant outliers in each variable

print(head(bank_data |> select(previous) |> arrange(desc(previous))))
##   previous
## 1      275
## 2       58
## 3       55
## 4       51
## 5       41
## 6       40
print(head(bank_data |> select(duration) |> arrange(desc(duration))))
##   duration
## 1     4918
## 2     3881
## 3     3785
## 4     3422
## 5     3366
## 6     3322
print(head(bank_data |> select(balance) |> arrange(desc(balance))))
##   balance
## 1  102127
## 2   98417
## 3   81204
## 4   81204
## 5   71188
## 6   66721

Correlations Between Variables

Binary Variables

# Check each combination of binary variables for correlation

bank_data$default = as.numeric(bank_data$default == "yes")
bank_data$housing = as.numeric(bank_data$housing == "yes")
bank_data$loan = as.numeric(bank_data$loan == "yes")
bank_data$y = as.numeric(bank_data$y == "yes")

binary_vars_1 <- c('default', 'default', 'default', 'housing', 'housing', 'loan')
binary_vars_2 <- c('housing', 'loan', 'y', 'loan', 'y', 'y')
binary_cors <- c(cor(bank_data$default, bank_data$housing, method = "pearson"), cor(bank_data$default, bank_data$loan, method = "pearson"), cor(bank_data$default, bank_data$y, method = "pearson"), cor(bank_data$housing, bank_data$loan, method = "pearson"), cor(bank_data$housing, bank_data$y, method = "pearson"), cor(bank_data$loan, bank_data$y, method = "pearson"))

binary_cor_df <- data.frame(var1 = binary_vars_1,
                            var2 = binary_vars_2,
                            phi = binary_cors) |>
  arrange(phi, var1, var2)

kable(binary_cor_df, caption = 'Correlation of Binary Variables', format = 'pipe', col.names = c('1st Variable', '2nd Variable', 'Phi Coefficient'), align = 'llc')
Correlation of Binary Variables
1st Variable 2nd Variable Phi Coefficient
housing y -0.1391727
loan y -0.0681850
default y -0.0224190
default housing -0.0060252
housing loan 0.0413229
default loan 0.0772342

There are no noteworthy correlations between the binary variables.

Numeric Variables

There is a moderate correlation between pdays and previous, but no other correlations stronger than .2 between the numeric variables.

# Create a correlation matrix for numeric variables

numeric_bank_data <- bank_data |>
  select(age, balance, day, duration, campaign, pdays, previous)

print(cor(numeric_bank_data))
##                   age      balance          day     duration     campaign
## age       1.000000000  0.097782739 -0.009120046 -0.004648428  0.004760312
## balance   0.097782739  1.000000000  0.004502585  0.021560380 -0.014578279
## day      -0.009120046  0.004502585  1.000000000 -0.030206341  0.162490216
## duration -0.004648428  0.021560380 -0.030206341  1.000000000 -0.084569503
## campaign  0.004760312 -0.014578279  0.162490216 -0.084569503  1.000000000
## pdays    -0.023758014  0.003435322 -0.093044074 -0.001564770 -0.088627668
## previous  0.001288319  0.016673637 -0.051710497  0.001203057 -0.032855290
##                 pdays     previous
## age      -0.023758014  0.001288319
## balance   0.003435322  0.016673637
## day      -0.093044074 -0.051710497
## duration -0.001564770  0.001203057
## campaign -0.088627668 -0.032855290
## pdays     1.000000000  0.454819635
## previous  0.454819635  1.000000000

However, the pdays variable uses a -1 for clients who were not previously contacted. Removing clients who had not been previously contacted from the data set results in effectively 0 correlation between the pdays and previous variables, indicating that the apparent correlation was the result of customers who had never been contacted always having -1 for pdays and 0 for previous. Besides this obvious overlap, there is no correlation between the variables.

# Check correlation between pdays and previous for previously contacted customers only

previously_contacted_customers <- numeric_bank_data |>
  filter(pdays != -1) |>
  select(pdays, previous)

print(cor(previously_contacted_customers))
##                pdays    previous
## pdays     1.00000000 -0.02188487
## previous -0.02188487  1.00000000

Connected Features

Some jobs have noticeably higher average balances than others.

# Compare average balance within job categories

avg_balance_by_job <- bank_data |>
  select(job, balance) |>
  group_by(job) |>
  mutate(avg_balance = mean(balance)) |>
  ungroup() |>
  select(job, avg_balance) |>
  distinct() |>
  arrange(desc(avg_balance))

avg_balance_by_job |> ggplot(aes(x = job, y = avg_balance)) + geom_col() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Average balance increases as age increases.

# Compare average balance by age

avg_balance_by_age <- bank_data |>
  select(age, balance) |>
  group_by(age) |>
  mutate(avg_balance = mean(balance)) |>
  ungroup() |>
  select(age, avg_balance) |>
  distinct() |>
  arrange(age)

avg_balance_by_age |> ggplot(aes(x = age, y = avg_balance)) + geom_line()

There is an odd spike in the graph. This is further explored below.

# Locate the spike in the data

print(
  avg_balance_by_age <- avg_balance_by_age |>
    arrange(desc(avg_balance))
)
## # A tibble: 77 × 2
##      age avg_balance
##    <int>       <dbl>
##  1    84      19511.
##  2    85       4603.
##  3    75       4057.
##  4    74       4026.
##  5    71       3462.
##  6    70       3395.
##  7    68       3196.
##  8    61       2758.
##  9    77       2600.
## 10    72       2541.
## # ℹ 67 more rows

The spike occurs at age 84.

# Examine the balance of all observations with age 84

print(bank_data |> select(age, balance) |> filter(age == 84) |> arrange(desc(balance)))
##   age balance
## 1  84   81204
## 2  84   81204
## 3  84    4761
## 4  84    2619
## 5  84    1791
## 6  84    1702
## 7  84    1680
## 8  84     639
## 9  84       0

There is a suspicious duplication of the largest balance. To see if this is the same person contacted, consider all variables for these two observations.

# Compare all variables

print(bank_data |> filter(age == 84) |> filter(balance == 81204) |> arrange(desc(balance)))
##   age     job marital education default balance housing loan   contact day
## 1  84 retired married secondary       0   81204       0    0 telephone  28
## 2  84 retired married secondary       0   81204       0    0 telephone   1
##   month duration campaign pdays previous poutcome y
## 1   dec      679        1   313        2    other 1
## 2   apr      390        1    94        3  success 1

The identifying information are identical. The differences suggest that this is the same respondent contacted twice during the campaign.

# Visualize the percent of respondents who are retired by age

retired_pct_by_age <- bank_data |>
  select(age, job) |>
  group_by(age) |>
  mutate(num_retired = sum(job == 'retired')) |>
  mutate(total = n()) |>
  ungroup() |>
  mutate(pct_retired = num_retired / total * 100) |>
  select(age, pct_retired) |>
  distinct()

retired_pct_by_age |> ggplot(aes(x = age, y = pct_retired)) + geom_line()

As expected, the percentage of respondents at each age who are retired is almst 0 until age 50, and then increases sharply.

# Visualize the percent of respondents who are students by age

student_pct_by_age <- bank_data |>
  select(age, job) |>
  group_by(age) |>
  mutate(num_student = sum(job == 'student')) |>
  mutate(total = n()) |>
  ungroup() |>
  mutate(pct_student = num_student / total * 100) |>
  select(age, pct_student) |>
  distinct()

student_pct_by_age |> ggplot(aes(x = age, y = pct_student)) + geom_line()

Similarly, the percentage of respondents are each age who are students starts high and then falls off to almost 0 by age 35.