Assignment 1

Daniel DeBonis

Background

A Portuguese bank conducted a marketing campaign (phone calls) to predict if a client will subscribe to a term deposit. The records of their efforts are available in the form of a dataset. The objective here is to apply machine learning techniques to analyze the dataset and figure out the most effective tactics that will help the bank in the next campaign to persuade more customers to subscribe to the bank’s term deposit.

Exploratory Data Analysis

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the semicolon-delimited bank marketing data; stringsAsFactors = TRUE
# turns every text column into a factor.
data <- read.csv('https://raw.githubusercontent.com/ddebonis47/classwork/refs/heads/main/bank-full.csv', sep = ';', stringsAsFactors = TRUE)
# Show each column's type and its first few values.
str(data)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
summary(data)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome       y        
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 

Visualizing the Distributions

# Keep only the numeric columns for the distribution plots.
numeric_vars <- data |>
  select(where(is.numeric))

# Histograms: reshape to long form (one row per variable/value pair) and draw
# one facet per variable. Axes are free because the variables are on very
# different scales. pivot_longer() replaces the superseded gather().
numeric_vars |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  ) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~variable, scales = "free") +
  theme_minimal()

# Names of all factor (categorical) columns.
cat_vars <- names(data)[vapply(data, is.factor, logical(1))]

# One bar chart per categorical variable showing the share of each level.
for (v in cat_vars) {
  df <- data |>
    count(.data[[v]]) |>
    mutate(percent = n / sum(n) * 100)

  print(
    ggplot(df, aes(x = .data[[v]], y = percent)) +
      geom_col(fill = "steelblue") +
      labs(y = "Percent", x = v) +
      theme_minimal() +
      # Tilt the level labels so long names do not overlap.
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  )
}

# Standardise every numeric column (centre and scale) so that variables with
# very different ranges can share one box-plot axis.
numeric_scaled <- as.data.frame(scale(numeric_vars))
numeric_scaled_long <- pivot_longer(
  numeric_scaled,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

ggplot(numeric_scaled_long, aes(x = variable, y = value)) +
  geom_boxplot(fill = "lightgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Data Cleaning

# Overwrite the extreme `previous` value of 275 with the column median, then
# re-check the summary.
median_prev <- median(data$previous, na.rm = TRUE)
data$previous[data$previous == 275] <- median_prev
summary(data)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous          poutcome       y        
##  Min.   : -1.0   Min.   : 0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.: 0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median : 0.0000   success: 1511              
##  Mean   : 40.2   Mean   : 0.5742   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.: 0.0000                              
##  Max.   :871.0   Max.   :58.0000                              
## 

Unknown vs NA

# Per column: how many entries are either NA or the literal string "unknown".
vapply(data, \(col) sum(is.na(col) | col == "unknown"), integer(1))
##       age       job   marital education   default   balance   housing      loan 
##         0       288         0      1857         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##     13020         0         0         0         0         0         0     36959 
##         y 
##         0
# pdays uses -1 as a sentinel; recode it to NA so it is not treated as a
# real number of days.
data$pdays <- na_if(data$pdays, -1)
summary(data)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous          poutcome       y        
##  Min.   :  1.0   Min.   : 0.0000   failure: 4901   no :39922  
##  1st Qu.:133.0   1st Qu.: 0.0000   other  : 1840   yes: 5289  
##  Median :194.0   Median : 0.0000   success: 1511              
##  Mean   :224.6   Mean   : 0.5742   unknown:36959              
##  3rd Qu.:327.0   3rd Qu.: 0.0000                              
##  Max.   :871.0   Max.   :58.0000                              
##  NA's   :36954

Search for More Outliers

# Rebuild the standardised long table after cleaning and redraw the box plots.
numeric_vars <- select(data, where(is.numeric))
numeric_scaled <- as.data.frame(scale(numeric_vars))
numeric_scaled_long <- pivot_longer(
  numeric_scaled,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

# NA values (pdays was recoded above) are dropped by the box-plot stat with a
# warning.
ggplot(numeric_scaled_long, aes(x = variable, y = value)) +
  geom_boxplot(fill = "lightgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 36954 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# One box plot per numeric variable, split by the target `y`.
# Uses aes() with the .data pronoun instead of the deprecated aes_string().
for (v in names(numeric_vars)) {
  print(
    ggplot(data, aes(x = y, y = .data[[v]])) +
      geom_boxplot(fill = "lightgreen") +
      # Label the axes explicitly so they show the variable names.
      labs(x = "y", y = v) +
      theme_minimal()
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 36954 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# For every categorical predictor, tabulate and plot the share of y = no/yes
# within each level. The target `y` is itself a factor in `cat_vars`; it is
# skipped because crossing y with y only yields a trivial 100% table.
for (v in setdiff(cat_vars, "y")) {
  df <- data |>
    group_by(across(all_of(c(v, "y")))) |>   # count each (level, y) cell
    summarise(n = n(), .groups = "drop") |>
    # Percent within each level of v; .by keeps the result ungrouped.
    mutate(percent = n / sum(n) * 100, .by = all_of(v))

  print(df)

  print(
    ggplot(df, aes(x = .data[[v]], y = percent, fill = y)) +
      geom_col() +
      theme_minimal() +
      labs(y = "Percent", x = v) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  )
}
## # A tibble: 24 × 4
## # Groups:   job [12]
##    job          y         n percent
##    <fct>        <fct> <int>   <dbl>
##  1 admin.       no     4540   87.8 
##  2 admin.       yes     631   12.2 
##  3 blue-collar  no     9024   92.7 
##  4 blue-collar  yes     708    7.27
##  5 entrepreneur no     1364   91.7 
##  6 entrepreneur yes     123    8.27
##  7 housemaid    no     1131   91.2 
##  8 housemaid    yes     109    8.79
##  9 management   no     8157   86.2 
## 10 management   yes    1301   13.8 
## # ℹ 14 more rows

## # A tibble: 6 × 4
## # Groups:   marital [3]
##   marital  y         n percent
##   <fct>    <fct> <int>   <dbl>
## 1 divorced no     4585    88.1
## 2 divorced yes     622    11.9
## 3 married  no    24459    89.9
## 4 married  yes    2755    10.1
## 5 single   no    10878    85.1
## 6 single   yes    1912    14.9

## # A tibble: 8 × 4
## # Groups:   education [4]
##   education y         n percent
##   <fct>     <fct> <int>   <dbl>
## 1 primary   no     6260   91.4 
## 2 primary   yes     591    8.63
## 3 secondary no    20752   89.4 
## 4 secondary yes    2450   10.6 
## 5 tertiary  no    11305   85.0 
## 6 tertiary  yes    1996   15.0 
## 7 unknown   no     1605   86.4 
## 8 unknown   yes     252   13.6

## # A tibble: 4 × 4
## # Groups:   default [2]
##   default y         n percent
##   <fct>   <fct> <int>   <dbl>
## 1 no      no    39159   88.2 
## 2 no      yes    5237   11.8 
## 3 yes     no      763   93.6 
## 4 yes     yes      52    6.38

## # A tibble: 4 × 4
## # Groups:   housing [2]
##   housing y         n percent
##   <fct>   <fct> <int>   <dbl>
## 1 no      no    16727   83.3 
## 2 no      yes    3354   16.7 
## 3 yes     no    23195   92.3 
## 4 yes     yes    1935    7.70

## # A tibble: 4 × 4
## # Groups:   loan [2]
##   loan  y         n percent
##   <fct> <fct> <int>   <dbl>
## 1 no    no    33162   87.3 
## 2 no    yes    4805   12.7 
## 3 yes   no     6760   93.3 
## 4 yes   yes     484    6.68

## # A tibble: 6 × 4
## # Groups:   contact [3]
##   contact   y         n percent
##   <fct>     <fct> <int>   <dbl>
## 1 cellular  no    24916   85.1 
## 2 cellular  yes    4369   14.9 
## 3 telephone no     2516   86.6 
## 4 telephone yes     390   13.4 
## 5 unknown   no    12490   95.9 
## 6 unknown   yes     530    4.07

## # A tibble: 24 × 4
## # Groups:   month [12]
##    month y         n percent
##    <fct> <fct> <int>   <dbl>
##  1 apr   no     2355    80.3
##  2 apr   yes     577    19.7
##  3 aug   no     5559    89.0
##  4 aug   yes     688    11.0
##  5 dec   no      114    53.3
##  6 dec   yes     100    46.7
##  7 feb   no     2208    83.4
##  8 feb   yes     441    16.6
##  9 jan   no     1261    89.9
## 10 jan   yes     142    10.1
## # ℹ 14 more rows

## # A tibble: 8 × 4
## # Groups:   poutcome [4]
##   poutcome y         n percent
##   <fct>    <fct> <int>   <dbl>
## 1 failure  no     4283   87.4 
## 2 failure  yes     618   12.6 
## 3 other    no     1533   83.3 
## 4 other    yes     307   16.7 
## 5 success  no      533   35.3 
## 6 success  yes     978   64.7 
## 7 unknown  no    33573   90.8 
## 8 unknown  yes    3386    9.16

## # A tibble: 2 × 3
## # Groups:   y [2]
##   y         n percent
##   <fct> <int>   <dbl>
## 1 no    39922     100
## 2 yes    5289     100

Correlations and Associations

# Using dlookr to get correlation/association
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:base':
## 
##     transform
library(corrplot)
## corrplot 0.95 loaded
# Correlation / association
# dlookr::correlate() returns a long tibble of var1/var2/coef_corr; the printed
# result lists pairs of numeric columns only.
assoc_matrix <- correlate(data)  
print(assoc_matrix)
## # A tibble: 42 × 3
##    var1     var2    coef_corr
##    <fct>    <fct>       <dbl>
##  1 balance  age       0.0978 
##  2 day      age      -0.00912
##  3 duration age      -0.00465
##  4 campaign age       0.00476
##  5 pdays    age      -0.108  
##  6 previous age       0.00184
##  7 age      balance   0.0978 
##  8 day      balance   0.00450
##  9 duration balance   0.0216 
## 10 campaign balance  -0.0146 
## # ℹ 32 more rows
# Or for numeric-only correlation
# pairwise.complete.obs: each pair uses the rows where both values are
# non-missing (pdays contains NA after recoding).
num_cor <- cor(numeric_vars, use="pairwise.complete.obs")
# Colour-coded correlation matrix with the coefficients printed in each cell.
corrplot(num_cor, method="color", addCoef.col="black")

# Numeric 0/1 copy of the target so it can be correlated with numeric columns.
data$y_num <- as.numeric(data$y == "yes")
# Re-select numeric columns; this now also picks up y_num.
numeric_vars <- select(data, where(is.numeric))
# Correlation of every numeric column with the 0/1 target.
cor_matrix <- cor(numeric_vars, data$y_num, use = "pairwise.complete.obs")
cor_matrix
##                 [,1]
## age       0.02515502
## balance   0.05283841
## day      -0.02834778
## duration  0.39452102
## campaign -0.07317201
## pdays    -0.15220590
## previous  0.11372511
## y_num     1.00000000
library(DescTools)
# Names of the factor columns (recomputed after the columns added above).
cat_vars <- names(Filter(is.factor, data))

# Cramer's V between two vectors, computed on complete pairs only.
#
# @param x,y Vectors of equal length (typically factors).
# @return The value of CramerV(x, y) after dropping every position where
#   either vector is NA, or NA_real_ when no complete pair remains.
safeCramerV <- function(x, y) {
  keep <- complete.cases(x, y)
  x <- x[keep]
  y <- y[keep]

  # x and y share the same mask, so one length check covers both. Return a
  # double NA so the result type matches the numeric value from CramerV().
  if (length(x) == 0) {
    return(NA_real_)
  }

  CramerV(x, y)
}
# Cramer's V of each categorical column against the target y.
assoc_y <- vapply(
  cat_vars,
  \(v) safeCramerV(data[[v]], data$y),
  numeric(1)
)

assoc_y
##        job    marital  education    default    housing       loan    contact 
## 0.13599047 0.06592570 0.07269548 0.02241897 0.13917270 0.06818503 0.15135540 
##      month   poutcome          y 
## 0.26023704 0.31166262 1.00000000
# Build a pairwise association table for every ordered pair of columns.
#
# The measure depends on the two column types:
#   * numeric vs numeric          -> Pearson correlation
#   * factor vs factor            -> Cramer's V via CramerV()
#   * numeric vs two-level factor -> Pearson with the factor coded 0/1 in
#                                    level order (point-biserial)
#   * anything else               -> NA
#
# @param df A data frame.
# @return A data frame with columns `var1`, `var2`, `association` and one row
#   per ordered pair of columns (self-pairs and both orderings included), in
#   the order var1 outer / var2 inner.
pairwise_assoc <- function(df) {
  vars <- names(df)

  # Association for one pair of columns; always a single double.
  pair_assoc <- function(v1, v2) {
    if (is.numeric(v1) && is.numeric(v2)) {
      cor(v1, v2, use = "pairwise.complete.obs")
    } else if (is.factor(v1) && is.factor(v2)) {
      CramerV(v1, v2)
    } else if (is.numeric(v1) && is.factor(v2) && nlevels(v2) == 2) {
      cor(v1, as.numeric(v2) - 1, use = "pairwise.complete.obs")
    } else if (is.factor(v1) && nlevels(v1) == 2 && is.numeric(v2)) {
      cor(as.numeric(v1) - 1, v2, use = "pairwise.complete.obs")
    } else {
      NA_real_
    }
  }

  # expand.grid() varies its first argument fastest, so listing `j` first
  # gives i-outer / j-inner ordering without growing a data frame row by row.
  pairs <- expand.grid(j = seq_along(vars), i = seq_along(vars))
  association <- vapply(
    seq_len(nrow(pairs)),
    function(k) pair_assoc(df[[vars[pairs$i[k]]]], df[[vars[pairs$j[k]]]]),
    numeric(1)
  )

  data.frame(
    var1 = vars[pairs$i],
    var2 = vars[pairs$j],
    association = association,
    stringsAsFactors = FALSE
  )
}
# Association table for every ordered pair of columns in `data`; at this point
# that includes both the factor `y` and its numeric copy `y_num`.
assoc_table <- pairwise_assoc(data)
head(assoc_table, 20)  # preview first 20 rows
##    var1      var2  association
## 1   age       age  1.000000000
## 2   age       job           NA
## 3   age   marital           NA
## 4   age education           NA
## 5   age   default -0.017879304
## 6   age   balance  0.097782739
## 7   age   housing -0.185513082
## 8   age      loan -0.015655273
## 9   age   contact           NA
## 10  age       day -0.009120046
## 11  age     month           NA
## 12  age  duration -0.004648428
## 13  age  campaign  0.004760312
## 14  age     pdays -0.107862882
## 15  age  previous  0.001836490
## 16  age  poutcome           NA
## 17  age         y  0.025155017
## 18  age     y_num  0.025155017
## 19  job       age           NA
## 20  job       job  1.000000000
# Drop pairs with no defined measure and list the rest from strongest to
# weakest association, ignoring sign.
assoc_table |>
  filter(!is.na(association)) |>
  arrange(desc(abs(association)))
##          var1      var2   association
## 1         age       age  1.0000000000
## 2         job       job  1.0000000000
## 3     marital   marital  1.0000000000
## 4   education education  1.0000000000
## 5     default   default  1.0000000000
## 6     balance   balance  1.0000000000
## 7     housing   housing  1.0000000000
## 8        loan      loan  1.0000000000
## 9     contact   contact  1.0000000000
## 10        day       day  1.0000000000
## 11      month     month  1.0000000000
## 12   duration  duration  1.0000000000
## 13   campaign  campaign  1.0000000000
## 14      pdays     pdays  1.0000000000
## 15   previous  previous  1.0000000000
## 16   poutcome  poutcome  1.0000000000
## 17          y         y  1.0000000000
## 18          y     y_num  1.0000000000
## 19      y_num         y  1.0000000000
## 20      y_num     y_num  1.0000000000
## 21    contact     month  0.5121267993
## 22      month   contact  0.5121267993
## 23    housing     month  0.5042128413
## 24      month   housing  0.5042128413
## 25        job education  0.4582592303
## 26  education       job  0.4582592303
## 27   duration         y  0.3945210159
## 28   duration     y_num  0.3945210159
## 29          y  duration  0.3945210159
## 30      y_num  duration  0.3945210159
## 31    housing     pdays  0.3351238860
## 32      pdays   housing  0.3351238860
## 33   poutcome         y  0.3116626168
## 34          y  poutcome  0.3116626168
## 35        job   housing  0.2817399221
## 36    housing       job  0.2817399221
## 37      month         y  0.2602370423
## 38          y     month  0.2602370423
## 39      month  poutcome  0.2143362828
## 40   poutcome     month  0.2143362828
## 41    housing   contact  0.2135850795
## 42    contact   housing  0.2135850795
## 43    contact  poutcome  0.2074712311
## 44   poutcome   contact  0.2074712311
## 45        job   marital  0.2060122042
## 46    marital       job  0.2060122042
## 47        age   housing -0.1855130815
## 48    housing       age -0.1855130815
## 49       loan     month  0.1828265550
## 50      month      loan  0.1828265550
## 51        day  campaign  0.1624902163
## 52   campaign       day  0.1624902163
## 53      pdays         y -0.1522058950
## 54      pdays     y_num -0.1522058950
## 55          y     pdays -0.1522058950
## 56      y_num     pdays -0.1522058950
## 57    contact         y  0.1513553979
## 58          y   contact  0.1513553979
## 59        job   contact  0.1504651748
## 60    contact       job  0.1504651748
## 61    housing  poutcome  0.1431471729
## 62   poutcome   housing  0.1431471729
## 63    housing         y  0.1391727025
## 64          y   housing  0.1391727025
## 65    housing     y_num -0.1391727025
## 66      y_num   housing -0.1391727025
## 67        job         y  0.1359904718
## 68          y       job  0.1359904718
## 69  education   contact  0.1227949246
## 70    contact education  0.1227949246
## 71    marital education  0.1216218048
## 72  education   marital  0.1216218048
## 73  education   housing  0.1193392732
## 74    housing education  0.1193392732
## 75   previous         y  0.1137251122
## 76   previous     y_num  0.1137251122
## 77          y  previous  0.1137251122
## 78      y_num  previous  0.1137251122
## 79        job     month  0.1102399554
## 80      month       job  0.1102399554
## 81  education     month  0.1101048520
## 82      month education  0.1101048520
## 83    balance     pdays -0.1081221245
## 84      pdays   balance -0.1081221245
## 85        age     pdays -0.1078628819
## 86      pdays       age -0.1078628819
## 87        job      loan  0.1065016639
## 88       loan       job  0.1065016639
## 89        age   balance  0.0977827394
## 90    balance       age  0.0977827394
## 91        day     pdays -0.0900946441
## 92      pdays       day -0.0900946441
## 93   duration  campaign -0.0845695027
## 94   campaign  duration -0.0845695027
## 95    balance      loan -0.0843502457
## 96       loan   balance -0.0843502457
## 97  education      loan  0.0802788983
## 98       loan education  0.0802788983
## 99    default      loan  0.0772342411
## 100      loan   default  0.0772342411
## 101  campaign         y -0.0731720063
## 102  campaign     y_num -0.0731720063
## 103         y  campaign -0.0731720063
## 104     y_num  campaign -0.0731720063
## 105 education         y  0.0726954758
## 106         y education  0.0726954758
## 107   marital     month  0.0723165992
## 108     month   marital  0.0723165992
## 109   balance   housing -0.0687683157
## 110   housing   balance -0.0687683157
## 111      loan         y  0.0681850347
## 112         y      loan  0.0681850347
## 113      loan     y_num -0.0681850347
## 114     y_num      loan -0.0681850347
## 115   default   balance -0.0667450571
## 116   balance   default -0.0667450571
## 117   marital         y  0.0659256986
## 118         y   marital  0.0659256986
## 119       job  poutcome  0.0642142026
## 120  poutcome       job  0.0642142026
## 121   default     month  0.0586747101
## 122     month   default  0.0586747101
## 123       day  previous -0.0571417686
## 124  previous       day -0.0571417686
## 125      loan  poutcome  0.0552479823
## 126  poutcome      loan  0.0552479823
## 127   balance         y  0.0528384103
## 128   balance     y_num  0.0528384103
## 129         y   balance  0.0528384103
## 130     y_num   balance  0.0528384103
## 131   marital      loan  0.0519365814
## 132      loan   marital  0.0519365814
## 133  campaign     pdays  0.0505336900
## 134     pdays  campaign  0.0505336900
## 135   marital   contact  0.0450906590
## 136   contact   marital  0.0450906590
## 137   housing  previous  0.0419125713
## 138  previous   housing  0.0419125713
## 139   housing      loan  0.0413228660
## 140      loan   housing  0.0413228660
## 141   default  poutcome  0.0404031963
## 142  poutcome   default  0.0404031963
## 143  campaign  previous -0.0388805268
## 144  previous  campaign -0.0388805268
## 145       job   default  0.0365333681
## 146   default       job  0.0365333681
## 147 education  poutcome  0.0356516656
## 148  poutcome education  0.0356516656
## 149   default     pdays  0.0337603184
## 150     pdays   default  0.0337603184
## 151     pdays  previous -0.0321324145
## 152  previous     pdays -0.0321324145
## 153       day  duration -0.0302063411
## 154  duration       day -0.0302063411
## 155   marital  poutcome  0.0290826813
## 156  poutcome   marital  0.0290826813
## 157       day         y -0.0283477767
## 158       day     y_num -0.0283477767
## 159         y       day -0.0283477767
## 160     y_num       day -0.0283477767
## 161   housing       day -0.0279816493
## 162       day   housing -0.0279816493
## 163       age         y  0.0251550171
## 164       age     y_num  0.0251550171
## 165         y       age  0.0251550171
## 166     y_num       age  0.0251550171
## 167  duration     pdays -0.0244065859
## 168     pdays  duration -0.0244065859
## 169   default   contact  0.0244057519
## 170   contact   default  0.0244057519
## 171   housing  campaign -0.0235987068
## 172  campaign   housing -0.0235987068
## 173      loan     pdays  0.0224539965
## 174     pdays      loan  0.0224539965
## 175   default         y  0.0224189659
## 176         y   default  0.0224189659
## 177   default     y_num -0.0224189659
## 178     y_num   default -0.0224189659
## 179   default  previous -0.0216973553
## 180  previous   default -0.0216973553
## 181   balance  duration  0.0215603805
## 182  duration   balance  0.0215603805
## 183   balance  previous  0.0209881426
## 184  previous   balance  0.0209881426
## 185   marital   housing  0.0206852270
## 186   housing   marital  0.0206852270
## 187   marital   default  0.0192303904
## 188   default   marital  0.0192303904
## 189       age   default -0.0178793036
## 190   default       age -0.0178793036
## 191   default  campaign  0.0168215314
## 192  campaign   default  0.0168215314
## 193      loan   contact  0.0162737669
## 194   contact      loan  0.0162737669
## 195 education   default  0.0158963802
## 196   default education  0.0158963802
## 197       age      loan -0.0156552727
## 198      loan       age -0.0156552727
## 199   balance  campaign -0.0145782789
## 200  campaign   balance -0.0145782789
## 201      loan  duration -0.0124119718
## 202  duration      loan -0.0124119718
## 203      loan  previous -0.0119403526
## 204  previous      loan -0.0119403526
## 205      loan       day  0.0113701576
## 206       day      loan  0.0113701576
## 207   default  duration -0.0100214613
## 208  duration   default -0.0100214613
## 209      loan  campaign  0.0099798459
## 210  campaign      loan  0.0099798459
## 211   default       day  0.0094238991
## 212       day   default  0.0094238991
## 213       age       day -0.0091200456
## 214       day       age -0.0091200456
## 215   default   housing  0.0060252184
## 216   housing   default  0.0060252184
## 217   housing  duration  0.0050754494
## 218  duration   housing  0.0050754494
## 219       age  campaign  0.0047603118
## 220  campaign       age  0.0047603118
## 221       age  duration -0.0046484285
## 222  duration       age -0.0046484285
## 223   balance       day  0.0045025851
## 224       day   balance  0.0045025851
## 225       age  previous  0.0018364901
## 226  previous       age  0.0018364901
## 227  duration  previous  0.0003279149
## 228  previous  duration  0.0003279149

Preprocessing

# Recode the "unknown" category to NA in the four factors that use it.
# NOTE(review): na_if() presumably leaves "unknown" as an unused factor
# level; confirm and add droplevels() if later modelling needs it removed.
data <- data |>
  mutate(
    across(
      c(poutcome, job, education, contact),
      \(col) na_if(col, "unknown")
    )
  )
library(lubridate)

# Derive calendar features for each contact.
# The file has no year column. Hard-coding 2008 for every row gives wrong
# weekdays for contacts made in later years, so the year is inferred instead:
# it advances each time the month number drops from one row to the next.
# NOTE(review): this assumes the rows are still in the file's original
# chronological order (the dataset is documented as ordered by date,
# May 2008 to Nov 2010) — confirm nothing upstream re-sorts `data`.
month_num <- match(as.character(data$month), tolower(month.abb))
year_rollover <- c(FALSE, diff(month_num) < 0L)[seq_along(month_num)]
campaign_year <- 2008L + cumsum(year_rollover)
if (any(campaign_year > 2010L, na.rm = TRUE)) {
  warning(
    "Inferred campaign years extend past 2010; rows may not be in date order.",
    call. = FALSE
  )
}

data$campaign_date <- dmy(paste(data$day, data$month, campaign_year))
data$campaign_weekday <- wday(data$campaign_date, label = TRUE)
data$campaign_quarter <- quarter(data$campaign_date)