library(data.table)
library(knitr)
library(tidyr)
library(MASS)
library(tidyverse)
library(caret)
library(pROC)
library(broom)
library(corrplot)
data_path <- "https://raw.githubusercontent.com/kccasey1985/Case-Study-Bank-Marketting-/main/bank-additional-full.csv"
df <- fread(data_path, sep = ";") |> as_tibble()
glimpse(df)
## Rows: 41,188
## Columns: 21
## $ age <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital <chr> "married", "married", "married", "married", "married", …
## $ education <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp.var.rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons.conf.idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr.employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
df2 <- df %>%
dplyr::mutate(job = factor(job),
marital = factor(marital),
education = factor(education),
default = factor(default),
housing = factor(housing),
loan = factor(loan),
contact = factor(contact),
month = factor(month),
day_of_week = factor(day_of_week),
poutcome = factor(poutcome),
y = factor(y, levels = c("no","yes"))) %>%
dplyr::select(-tidyselect::any_of("duration"))
df2 <- df2 %>%
dplyr::mutate(age = cut(age,
breaks = c(20, 30, 40, 50, 60, 70, 100),
labels = c("21-30","31-40","41-50","51-60","61-70","71+"),
right = FALSE))
drop_cols <- c("emp.var.rate","cons.price.idx","euribor3m","nr.employed")
df3 <- df2 %>%
dplyr::select(-tidyselect::any_of(drop_cols))
str(df)
## tibble [41,188 × 21] (S3: tbl_df/tbl/data.frame)
## $ age : int [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr [1:41188] "housemaid" "services" "services" "admin." ...
## $ marital : chr [1:41188] "married" "married" "married" "married" ...
## $ education : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr [1:41188] "no" "unknown" "no" "no" ...
## $ housing : chr [1:41188] "no" "no" "yes" "no" ...
## $ loan : chr [1:41188] "no" "no" "no" "no" ...
## $ contact : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr [1:41188] "may" "may" "may" "may" ...
## $ day_of_week : chr [1:41188] "mon" "mon" "mon" "mon" ...
## $ duration : int [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num [1:41188] 94 94 94 94 94 ...
## $ cons.conf.idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num [1:41188] 5191 5191 5191 5191 5191 ...
## $ y : chr [1:41188] "no" "no" "no" "no" ...
summary(df3)
## age job marital education
## 21-30: 5594 admin. :10422 divorced: 4612 university.degree :12168
## 31-40:16938 blue-collar: 9254 married :24928 high.school : 9515
## 41-50:10526 technician : 6743 single :11568 basic.9y : 6045
## 51-60: 6862 services : 3969 unknown : 80 professional.course: 5243
## 61-70: 724 management : 2924 basic.4y : 4176
## 71+ : 469 retired : 1720 basic.6y : 2292
## NA's : 75 (Other) : 6156 (Other) : 1749
## default housing loan contact
## no :32588 no :18622 no :33950 cellular :26144
## unknown: 8597 unknown: 990 unknown: 990 telephone:15044
## yes : 3 yes :21576 yes : 6248
##
##
##
##
## month day_of_week campaign pdays previous
## may :13769 fri:7827 Min. : 1.000 Min. : 0.0 Min. :0.000
## jul : 7174 mon:8514 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000
## aug : 6178 thu:8623 Median : 2.000 Median :999.0 Median :0.000
## jun : 5318 tue:8090 Mean : 2.568 Mean :962.5 Mean :0.173
## nov : 4101 wed:8134 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## apr : 2632 Max. :56.000 Max. :999.0 Max. :7.000
## (Other): 2016
## poutcome cons.conf.idx y
## failure : 4252 Min. :-50.8 no :36548
## nonexistent:35563 1st Qu.:-42.7 yes: 4640
## success : 1373 Median :-41.8
## Mean :-40.5
## 3rd Qu.:-36.4
## Max. :-26.9
##
df %>%
dplyr::select(tidyselect::any_of(c("emp.var.rate","cons.price.idx","euribor3m","nr.employed"))) %>%
cor(use = "complete.obs") %>%
corrplot::corrplot(method = "number", type = "upper", tl.col = "black")
df3 %>%
count(y) %>%
ggplot(aes(y, n, fill = y)) +
geom_col(width = 0.6) +
geom_text(aes(label = n), vjust = -0.4) +
labs(x = "Response", y = "Count", title = "Target Variable Distribution") +
theme_minimal() +
theme(legend.position = "none")
set.seed(123)
train_idx <- caret::createDataPartition(df3$y, p = 0.7, list = FALSE)
train_unbal <- df3[train_idx, ]
test_unbal <- df3[-train_idx, ]
model_terms <- all.vars(formula)
drop_terms <- function(d) tidyr::drop_na(d, tidyselect::all_of(model_terms))
train_unbal_cc <- drop_terms(train_unbal)
test_unbal_cc <- drop_terms(test_unbal)
train_x <- dplyr::select(train_unbal_cc, -y)
train_y <- train_unbal_cc$y
train_bal_cc <- caret::upSample(x = train_x, y = train_y, yname = "y") %>%
tibble::as_tibble()
formula <- y ~ age + job + marital + education + default + housing + contact + month + day_of_week + poutcome + campaign + pdays + previous + cons.conf.idx
fit_log_unbal <- glm(formula, data = train_unbal_cc, family = binomial)
fit_lda_unbal <- MASS::lda(formula, data = train_unbal_cc)
fit_log_bal <- glm(formula, data = train_bal_cc, family = binomial)
fit_lda_bal <- MASS::lda(formula, data = train_bal_cc)
pred_log_unbal <- predict(fit_log_unbal, newdata = test_unbal_cc, type = "response")
pred_lda_unbal <- predict(fit_lda_unbal, newdata = test_unbal_cc)$posterior[, "yes"]
pred_log_bal <- predict(fit_log_bal, newdata = test_unbal_cc, type = "response")
pred_lda_bal <- predict(fit_lda_bal, newdata = test_unbal_cc)$posterior[, "yes"]
eval_one <- function(truth, prob, label, thr = 0.5) {
roc_obj <- pROC::roc(response = truth, predictor = prob,
levels = c("no","yes"), direction = "<")
pred_cls <- factor(ifelse(prob >= thr, "yes", "no"), levels = c("no","yes"))
cm <- caret::confusionMatrix(pred_cls, truth, positive = "yes")
tibble(model = label,
auc = as.numeric(roc_obj$auc),
accuracy = as.numeric(cm$overall["Accuracy"]),
sensitivity = as.numeric(cm$byClass["Sensitivity"]),
specificity = as.numeric(cm$byClass["Specificity"]))
}
results_final <- bind_rows(eval_one(test_unbal_cc$y, pred_log_unbal, "logistic (train=unbal)"),
eval_one(test_unbal_cc$y, pred_log_bal, "logistic (train=bal)"),
eval_one(test_unbal_cc$y, pred_lda_unbal, "lda (train=unbal)"),
eval_one(test_unbal_cc$y, pred_lda_bal, "lda (train=bal)"))
kable(results_final, digits = 4, caption = "model performance summary (evaluated on unbalanced test)")
model | auc | accuracy | sensitivity | specificity |
---|---|---|---|---|
logistic (train=unbal) | 0.7697 | 0.8962 | 0.1965 | 0.9847 |
logistic (train=bal) | 0.7719 | 0.8208 | 0.5932 | 0.8496 |
lda (train=unbal) | 0.7676 | 0.8927 | 0.2861 | 0.9693 |
lda (train=bal) | 0.7705 | 0.8171 | 0.5867 | 0.8462 |
print_cm <- function(truth, prob, title) {
pred_cls <- factor(ifelse(prob > 0.5, "yes", "no"), levels = c("no","yes"))
cm <- caret::confusionMatrix(pred_cls, truth, positive = "yes")
cat("\n", title, "\n")
print(cm$table)
cat(sprintf("Accuracy=%.3f Sensitivity=%.3f Specificity=%.3f\n",
cm$overall["Accuracy"], cm$byClass["Sensitivity"], cm$byClass["Specificity"]))
}
print_cm(test_unbal_cc$y, pred_log_unbal, "Confusion Matrix: Logistic (Train=Unbalanced)")
##
## Confusion Matrix: Logistic (Train=Unbalanced)
## Reference
## Prediction no yes
## no 10784 1112
## yes 168 272
## Accuracy=0.896 Sensitivity=0.197 Specificity=0.985
print_cm(test_unbal_cc$y, pred_log_bal, "Confusion Matrix: Logistic (Train=Balanced)")
##
## Confusion Matrix: Logistic (Train=Balanced)
## Reference
## Prediction no yes
## no 9305 563
## yes 1647 821
## Accuracy=0.821 Sensitivity=0.593 Specificity=0.850
print_cm(test_unbal_cc$y, pred_lda_unbal, "Confusion Matrix: LDA (Train=Unbalanced)")
##
## Confusion Matrix: LDA (Train=Unbalanced)
## Reference
## Prediction no yes
## no 10616 988
## yes 336 396
## Accuracy=0.893 Sensitivity=0.286 Specificity=0.969
print_cm(test_unbal_cc$y, pred_lda_bal, "Confusion Matrix: LDA (Train=Balanced)")
##
## Confusion Matrix: LDA (Train=Balanced)
## Reference
## Prediction no yes
## no 9268 572
## yes 1684 812
## Accuracy=0.817 Sensitivity=0.587 Specificity=0.846
df3_clean <- df3[complete.cases(df3), ]
logit_model <- glm(y ~ ., data = df3_clean, family = binomial)
logit_probs <- predict(logit_model, newdata = df3_clean, type = "response")
roc_logit <- pROC::roc(df3_clean$y, logit_probs, levels = c("no","yes"), direction = "<")
logit_auc <- as.numeric(roc_logit$auc)
lda_model <- MASS::lda(y ~ ., data = df3_clean)
lda_probs <- predict(lda_model, newdata = df3_clean)$posterior[, "yes"]
roc_lda <- pROC::roc(df3_clean$y, lda_probs, levels = c("no","yes"), direction = "<")
lda_auc <- as.numeric(roc_lda$auc)
plot(roc_logit, col = "black", lwd = 2, main = "ROC Curves: Logistic vs LDA (Unbalanced)")
plot(roc_lda, col = "red", lwd = 2, add = TRUE)
legend("bottomright",
legend = c(sprintf("Logistic (AUC=%.3f)", logit_auc),
sprintf("LDA (AUC=%.3f)", lda_auc)),
col = c("black","red"), lwd = 2)
cat(sprintf("Logistic Regression AUC: %.4f\n", logit_auc))
## Logistic Regression AUC: 0.7698
cat(sprintf("LDA AUC: %.4f\n", lda_auc))
## LDA AUC: 0.7685