Data

Data Cleaning

library(data.table)
library(knitr)
library(tidyr)
library(MASS)   
library(tidyverse)
library(caret)     
library(pROC)      
library(broom)    
library(corrplot)
data_path <- "https://raw.githubusercontent.com/kccasey1985/Case-Study-Bank-Marketting-/main/bank-additional-full.csv"
df <- fread(data_path, sep = ";") |> as_tibble()
glimpse(df)
## Rows: 41,188
## Columns: 21
## $ age            <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital        <chr> "married", "married", "married", "married", "married", …
## $ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration       <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons.conf.idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
df2 <- df %>%
  dplyr::mutate(job         = factor(job),
                marital     = factor(marital),
                education   = factor(education),
                default     = factor(default),
                housing     = factor(housing),
                loan        = factor(loan),
                contact     = factor(contact),
                month       = factor(month),
                day_of_week = factor(day_of_week),
                poutcome    = factor(poutcome),
                y           = factor(y, levels = c("no","yes"))) %>%
  dplyr::select(-tidyselect::any_of("duration")) 

df2 <- df2 %>%
  dplyr::mutate(age = cut(age,
                          breaks = c(20, 30, 40, 50, 60, 70, 100),
                          labels = c("21-30","31-40","41-50","51-60","61-70","71+"),
                          right = FALSE))

drop_cols <- c("emp.var.rate","cons.price.idx","euribor3m","nr.employed")
df3 <- df2 %>%
  dplyr::select(-tidyselect::any_of(drop_cols))

View Data

str(df)
## tibble [41,188 × 21] (S3: tbl_df/tbl/data.frame)
##  $ age           : int [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr [1:41188] "housemaid" "services" "services" "admin." ...
##  $ marital       : chr [1:41188] "married" "married" "married" "married" ...
##  $ education     : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr [1:41188] "no" "unknown" "no" "no" ...
##  $ housing       : chr [1:41188] "no" "no" "yes" "no" ...
##  $ loan          : chr [1:41188] "no" "no" "no" "no" ...
##  $ contact       : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:41188] "may" "may" "may" "may" ...
##  $ day_of_week   : chr [1:41188] "mon" "mon" "mon" "mon" ...
##  $ duration      : int [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num [1:41188] 94 94 94 94 94 ...
##  $ cons.conf.idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num [1:41188] 5191 5191 5191 5191 5191 ...
##  $ y             : chr [1:41188] "no" "no" "no" "no" ...
summary(df3)
##     age                 job            marital                    education    
##  21-30: 5594   admin.     :10422   divorced: 4612   university.degree  :12168  
##  31-40:16938   blue-collar: 9254   married :24928   high.school        : 9515  
##  41-50:10526   technician : 6743   single  :11568   basic.9y           : 6045  
##  51-60: 6862   services   : 3969   unknown :   80   professional.course: 5243  
##  61-70:  724   management : 2924                    basic.4y           : 4176  
##  71+  :  469   retired    : 1720                    basic.6y           : 2292  
##  NA's :   75   (Other)    : 6156                    (Other)            : 1749  
##     default         housing           loan            contact     
##  no     :32588   no     :18622   no     :33950   cellular :26144  
##  unknown: 8597   unknown:  990   unknown:  990   telephone:15044  
##  yes    :    3   yes    :21576   yes    : 6248                    
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##      month       day_of_week    campaign          pdays          previous    
##  may    :13769   fri:7827    Min.   : 1.000   Min.   :  0.0   Min.   :0.000  
##  jul    : 7174   mon:8514    1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000  
##  aug    : 6178   thu:8623    Median : 2.000   Median :999.0   Median :0.000  
##  jun    : 5318   tue:8090    Mean   : 2.568   Mean   :962.5   Mean   :0.173  
##  nov    : 4101   wed:8134    3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000  
##  apr    : 2632               Max.   :56.000   Max.   :999.0   Max.   :7.000  
##  (Other): 2016                                                               
##         poutcome     cons.conf.idx     y        
##  failure    : 4252   Min.   :-50.8   no :36548  
##  nonexistent:35563   1st Qu.:-42.7   yes: 4640  
##  success    : 1373   Median :-41.8              
##                      Mean   :-40.5              
##                      3rd Qu.:-36.4              
##                      Max.   :-26.9              
## 

Data Preprocessing

df %>%
  dplyr::select(tidyselect::any_of(c("emp.var.rate","cons.price.idx","euribor3m","nr.employed"))) %>%
  cor(use = "complete.obs") %>%
  corrplot::corrplot(method = "number", type = "upper", tl.col = "black")

Data Limitations

Class Balance

df3 %>%
  count(y) %>%
  ggplot(aes(y, n, fill = y)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = n), vjust = -0.4) +
  labs(x = "Response", y = "Count", title = "Target Variable Distribution") +
  theme_minimal() +
  theme(legend.position = "none")

Data Limitations

Findings

Train/Test Split & Balancing

set.seed(123)
train_idx   <- caret::createDataPartition(df3$y, p = 0.7, list = FALSE)
train_unbal <- df3[train_idx, ]
test_unbal  <- df3[-train_idx, ]

model_terms <- all.vars(formula)
drop_terms  <- function(d) tidyr::drop_na(d, tidyselect::all_of(model_terms))

train_unbal_cc <- drop_terms(train_unbal)
test_unbal_cc  <- drop_terms(test_unbal)
train_x <- dplyr::select(train_unbal_cc, -y)
train_y <- train_unbal_cc$y
train_bal_cc <- caret::upSample(x = train_x, y = train_y, yname = "y") %>%
  tibble::as_tibble()

Fit Models

formula <- y ~ age + job + marital + education + default + housing + contact + month + day_of_week + poutcome + campaign + pdays + previous + cons.conf.idx

fit_log_unbal <- glm(formula, data = train_unbal_cc, family = binomial)
fit_lda_unbal <- MASS::lda(formula, data = train_unbal_cc)

fit_log_bal   <- glm(formula, data = train_bal_cc, family = binomial)
fit_lda_bal   <- MASS::lda(formula, data = train_bal_cc)

Evaluate

pred_log_unbal <- predict(fit_log_unbal, newdata = test_unbal_cc, type = "response")
pred_lda_unbal <- predict(fit_lda_unbal, newdata = test_unbal_cc)$posterior[, "yes"]
pred_log_bal   <- predict(fit_log_bal,   newdata = test_unbal_cc, type = "response")
pred_lda_bal   <- predict(fit_lda_bal,   newdata = test_unbal_cc)$posterior[, "yes"]
eval_one <- function(truth, prob, label, thr = 0.5) {
  roc_obj  <- pROC::roc(response = truth, predictor = prob,
                        levels = c("no","yes"), direction = "<")
  pred_cls <- factor(ifelse(prob >= thr, "yes", "no"), levels = c("no","yes"))
  cm <- caret::confusionMatrix(pred_cls, truth, positive = "yes")
  tibble(model       = label,
         auc         = as.numeric(roc_obj$auc),
         accuracy    = as.numeric(cm$overall["Accuracy"]),
         sensitivity = as.numeric(cm$byClass["Sensitivity"]),
         specificity = as.numeric(cm$byClass["Specificity"]))
}

results_final <- bind_rows(eval_one(test_unbal_cc$y, pred_log_unbal, "logistic (train=unbal)"),
                           eval_one(test_unbal_cc$y, pred_log_bal,   "logistic (train=bal)"),
                           eval_one(test_unbal_cc$y, pred_lda_unbal, "lda (train=unbal)"),
                           eval_one(test_unbal_cc$y, pred_lda_bal,   "lda (train=bal)"))

kable(results_final, digits = 4, caption = "model performance summary (evaluated on unbalanced test)")
model performance summary (evaluated on unbalanced test)
model auc accuracy sensitivity specificity
logistic (train=unbal) 0.7697 0.8962 0.1965 0.9847
logistic (train=bal) 0.7719 0.8208 0.5932 0.8496
lda (train=unbal) 0.7676 0.8927 0.2861 0.9693
lda (train=bal) 0.7705 0.8171 0.5867 0.8462

Confusion Matrices

print_cm <- function(truth, prob, title) {
  pred_cls <- factor(ifelse(prob > 0.5, "yes", "no"), levels = c("no","yes"))
  cm <- caret::confusionMatrix(pred_cls, truth, positive = "yes")
  cat("\n", title, "\n")
  print(cm$table)
  cat(sprintf("Accuracy=%.3f  Sensitivity=%.3f  Specificity=%.3f\n",
              cm$overall["Accuracy"], cm$byClass["Sensitivity"], cm$byClass["Specificity"]))
}

print_cm(test_unbal_cc$y, pred_log_unbal, "Confusion Matrix: Logistic (Train=Unbalanced)")
## 
##  Confusion Matrix: Logistic (Train=Unbalanced) 
##           Reference
## Prediction    no   yes
##        no  10784  1112
##        yes   168   272
## Accuracy=0.896  Sensitivity=0.197  Specificity=0.985
print_cm(test_unbal_cc$y, pred_log_bal,   "Confusion Matrix: Logistic (Train=Balanced)")
## 
##  Confusion Matrix: Logistic (Train=Balanced) 
##           Reference
## Prediction   no  yes
##        no  9305  563
##        yes 1647  821
## Accuracy=0.821  Sensitivity=0.593  Specificity=0.850
print_cm(test_unbal_cc$y, pred_lda_unbal, "Confusion Matrix: LDA (Train=Unbalanced)")
## 
##  Confusion Matrix: LDA (Train=Unbalanced) 
##           Reference
## Prediction    no   yes
##        no  10616   988
##        yes   336   396
## Accuracy=0.893  Sensitivity=0.286  Specificity=0.969
print_cm(test_unbal_cc$y, pred_lda_bal,   "Confusion Matrix: LDA (Train=Balanced)")
## 
##  Confusion Matrix: LDA (Train=Balanced) 
##           Reference
## Prediction   no  yes
##        no  9268  572
##        yes 1684  812
## Accuracy=0.817  Sensitivity=0.587  Specificity=0.846

ROC Curves

df3_clean <- df3[complete.cases(df3), ]

logit_model <- glm(y ~ ., data = df3_clean, family = binomial)
logit_probs <- predict(logit_model, newdata = df3_clean, type = "response")
roc_logit <- pROC::roc(df3_clean$y, logit_probs, levels = c("no","yes"), direction = "<")
logit_auc <- as.numeric(roc_logit$auc)

lda_model <- MASS::lda(y ~ ., data = df3_clean)
lda_probs <- predict(lda_model, newdata = df3_clean)$posterior[, "yes"]
roc_lda <- pROC::roc(df3_clean$y, lda_probs, levels = c("no","yes"), direction = "<")
lda_auc <- as.numeric(roc_lda$auc)

plot(roc_logit, col = "black", lwd = 2, main = "ROC Curves: Logistic vs LDA (Unbalanced)")
plot(roc_lda,   col = "red",   lwd = 2, add = TRUE)
legend("bottomright",
       legend = c(sprintf("Logistic (AUC=%.3f)", logit_auc),
                  sprintf("LDA (AUC=%.3f)", lda_auc)),
       col = c("black","red"), lwd = 2)

cat(sprintf("Logistic Regression AUC: %.4f\n", logit_auc))
## Logistic Regression AUC: 0.7698
cat(sprintf("LDA AUC: %.4f\n", lda_auc))
## LDA AUC: 0.7685