set.seed(42)
options(dplyr.summarise.inform = FALSE)
library(tidyverse)
library(janitor)
library(caret)
library(pROC)
library(rpart)
library(randomForest)
library(adabag)
library(knitr)
theme_set(theme_minimal())
file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"
bank <- read.csv(file_path, sep = ";", header = TRUE, stringsAsFactors = FALSE) |>
  clean_names() |>                                  # snake_case column names
  mutate(y = factor(y, levels = c("no","yes"))) |>  # "yes" is the positive class
  dplyr::select(-dplyr::any_of("duration")) |>      # drop duration: leakage (only known after the call ends)
  mutate(across(where(is.character), as.factor))
glimpse(bank)
## Rows: 41,188
## Columns: 20
## $ age <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <fct> housemaid, services, services, admin., services, servic…
## $ marital <fct> married, married, married, married, married, married, m…
## $ education <fct> basic.4y, high.school, high.school, basic.6y, high.scho…
## $ default <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp_var_rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons_price_idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons_conf_idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr_employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
table(bank$y)
##
## no yes
## 36548 4640
sum(is.na(bank))
## [1] 0
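The target is imbalanced, roughly 89% no vs. 11% yes, so raw accuracy will look flattering for any model. A quick proportion check (a small addition, not part of the original run):

# Derived from the counts above: 36548/41188 ≈ 0.887 "no", 4640/41188 ≈ 0.113 "yes"
prop.table(table(bank$y))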
Purpose:
- Creates an 80/20 stratified split (keeps the yes/no ratio stable).
- Ensures test factor levels match training (avoids "new levels" prediction errors).
- If a novel level appears in test, it is mapped to "unknown" (or to the first training level if "unknown" is unavailable).
set.seed(42)
idx <- caret::createDataPartition(bank$y, p = 0.80, list = FALSE)
train <- bank[idx, ]
test <- bank[-idx, ]
factor_cols <- setdiff(names(train)[sapply(train, is.factor)], "y")
for (col in factor_cols) {
  allowed <- levels(train[[col]])
  if (is.null(allowed)) next
  # Work on characters so assigning a fallback can't silently produce NA
  vals <- as.character(test[[col]])
  novel_idx <- !(vals %in% allowed)
  if (any(novel_idx, na.rm = TRUE)) {
    fallback <- if ("unknown" %in% allowed) "unknown" else allowed[1]
    vals[novel_idx] <- fallback
  }
  # Re-level so predict() sees exactly the training encoding
  test[[col]] <- factor(vals, levels = allowed)
}
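A quick sanity check (added here as a sketch) to confirm the remapping introduced no NAs and that train/test level sets agree:

# No NAs should remain in the remapped factor columns
stopifnot(!anyNA(test[factor_cols]))
# Every factor predictor must share an identical level set across the split
stopifnot(all(vapply(
  factor_cols,
  function(col) identical(levels(train[[col]]), levels(test[[col]])),
  logical(1)
)))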
Purpose:
- Centralizes metric computation.
- Uses pROC for AUC with the correct positive class.
- Returns counts (TP/FP/FN/TN) for confusion summaries.
- Assumes pred_class was produced with a consistent 0.5 threshold.
metrics <- function(y_true, prob_yes, pred_class){
  y_true     <- factor(y_true, levels = c("no","yes"))
  pred_class <- factor(pred_class, levels = c("no","yes"))
  # Pin direction = "<" (controls < cases) so AUC can't silently flip,
  # and quiet = TRUE suppresses the repeated "Setting direction" message
  roc_obj <- pROC::roc(y_true, prob_yes, levels = c("no","yes"),
                       direction = "<", quiet = TRUE)
  auc <- as.numeric(pROC::auc(roc_obj))
  cm <- table(truth = y_true, pred = pred_class)
  TP <- ifelse("yes" %in% rownames(cm) && "yes" %in% colnames(cm), cm["yes","yes"], 0)
  FP <- ifelse("no"  %in% rownames(cm) && "yes" %in% colnames(cm), cm["no","yes"], 0)
  FN <- ifelse("yes" %in% rownames(cm) && "no"  %in% colnames(cm), cm["yes","no"], 0)
  TN <- ifelse("no"  %in% rownames(cm) && "no"  %in% colnames(cm), cm["no","no"], 0)
  acc  <- (TP + TN) / sum(cm)
  prec <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  rec  <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  data.frame(
    Accuracy      = acc,
    Precision_Yes = prec,
    Recall_Yes    = rec,
    ROC_AUC       = auc,
    TP = TP, FP = FP, FN = FN, TN = TN,
    stringsAsFactors = FALSE
  )
}
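A toy call (hypothetical data, just to illustrate the return shape and verify the helper):

toy_truth <- factor(c("no","no","yes","yes"), levels = c("no","yes"))
toy_prob  <- c(0.1, 0.6, 0.4, 0.9)
toy_pred  <- factor(ifelse(toy_prob >= 0.5, "yes", "no"), levels = c("no","yes"))
metrics(toy_truth, toy_prob, toy_pred)
# Expect Accuracy = 0.5, Precision_Yes = 0.5, Recall_Yes = 0.5, ROC_AUC = 0.75
# (TP = 1, FP = 1, FN = 1, TN = 1)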
Purpose: A shallower tree (maxdepth=3) should generalize better (lower variance), likely yielding higher precision but lower recall than a deeper tree.
results <- tibble()
ctrl <- rpart.control(maxdepth = 3, cp = 0.0, minsplit = 20)
dt1 <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)
prob <- predict(dt1, newdata = test, type = "prob")[,"yes"]
pred <- factor(ifelse(prob >= 0.5, "yes", "no"), levels = c("no","yes"))
res_dt1 <- metrics(test$y, prob, pred) |>
mutate(Algorithm = "Decision Tree", Experiment = "Depth=3")
res_dt1
results <- bind_rows(results, res_dt1)
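To see which splits the shallow tree actually relies on (an inspection step, not part of the scored results):

print(dt1)               # text view of the fitted splits
dt1$variable.importance  # relative importance of the predictors rpart used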
Purpose: A deeper tree (maxdepth=8) should increase recall and AUC (lower bias), possibly with some precision loss.
ctrl <- rpart.control(maxdepth = 8, cp = 0.0, minsplit = 20)
dt2 <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)
prob <- predict(dt2, newdata = test, type = "prob")[,"yes"]
pred <- factor(ifelse(prob >= 0.5, "yes", "no"), levels = c("no","yes"))
res_dt2 <- metrics(test$y, prob, pred) |>
mutate(Algorithm = "Decision Tree", Experiment = "Depth=8")
res_dt2
results <- bind_rows(results, res_dt2)
Purpose: Averaging many decorrelated trees (bagging plus random feature subsets) reduces variance. With 100 trees, expect better AUC/recall than a single tree while keeping accuracy and precision competitive.
rf1 <- randomForest::randomForest(y ~ ., data = train, ntree = 100)
prob <- predict(rf1, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf1, newdata = test, type = "response")
res_rf1 <- metrics(test$y, prob, pred) |>
mutate(Algorithm = "Random Forest", Experiment = "Trees=100")
res_rf1
results <- bind_rows(results, res_rf1)
Purpose: Increasing to 300 trees should slightly improve or stabilize AUC (law of large numbers for bagging) with similar precision/recall.
rf2 <- randomForest::randomForest(y ~ ., data = train, ntree = 300)
prob <- predict(rf2, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf2, newdata = test, type = "response")
res_rf2 <- metrics(test$y, prob, pred) |>
mutate(Algorithm = "Random Forest", Experiment = "Trees=300")
res_rf2
results <- bind_rows(results, res_rf2)
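The extra trees mainly stabilize the vote averages. To see which predictors drive the forest, one could inspect impurity-based importance (exploratory, not part of the results table):

randomForest::importance(rf2)   # MeanDecreaseGini per predictor
randomForest::varImpPlot(rf2, main = "Random Forest (300 trees)")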
Purpose: With 100 boosting rounds and maxdepth=3 weak learners, expect higher AUC (lower bias) and strong precision at a 0.5 threshold.
ab1 <- adabag::boosting(
y ~ ., data = train, boos = TRUE, mfinal = 100,
control = rpart.control(maxdepth = 3)
)
pred_obj <- predict(ab1, newdata = test)
votes <- pred_obj$votes                 # weighted votes per class
colnames(votes) <- levels(train$y)      # assumes vote columns follow the training level order
prob <- votes[,"yes"] / rowSums(votes)  # normalize votes to a pseudo-probability for "yes"
pred <- factor(pred_obj$class, levels = c("no","yes"))
res_ab1 <- metrics(test$y, prob, pred) |>
mutate(Algorithm = "AdaBoost", Experiment = "Est=100")
res_ab1
results <- bind_rows(results, res_ab1)
Purpose: More rounds (300) should further improve AUC (best ranking) while precision stays high; recall may fluctuate at a fixed 0.5 threshold.
ab2 <- adabag::boosting(
y ~ ., data = train, boos = TRUE, mfinal = 300,
control = rpart.control(maxdepth = 3)
)
pred_obj <- predict(ab2, newdata = test)
votes <- pred_obj$votes
colnames(votes) <- levels(train$y)
prob <- votes[,"yes"] / rowSums(votes)
pred <- factor(pred_obj$class, levels = c("no","yes"))
res_ab2 <- metrics(test$y, prob, pred) |>
mutate(Algorithm = "AdaBoost", Experiment = "Est=300")
res_ab2
results <- bind_rows(results, res_ab2)
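adabag can trace test error across boosting rounds, which shows whether the jump from 100 to 300 rounds actually helped (exploratory sketch):

evol <- adabag::errorevol(ab2, newdata = test)
plot(evol$error, type = "l",
     xlab = "Boosting round", ylab = "Test error")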
Purpose:
- Rounds key metrics for readability.
- Adds a compact confusion matrix string.
- Presents all six experiments in a single table (rubric-friendly).
results |>
mutate(across(c(Accuracy, Precision_Yes, Recall_Yes, ROC_AUC), ~round(.x, 3))) |>
mutate(Confusion = paste0("TP=", TP, ", FP=", FP, ", FN=", FN, ", TN=", TN)) |>
select(Algorithm, Experiment, Accuracy, Precision_Yes, Recall_Yes, ROC_AUC, Confusion) |>
arrange(Algorithm, Experiment) |>
knitr::kable(caption = "Experiment Results")
| Algorithm | Experiment | Accuracy | Precision_Yes | Recall_Yes | ROC_AUC | Confusion |
|---|---|---|---|---|---|---|
| AdaBoost | Est=100 | 0.904 | 0.702 | 0.264 | 0.812 | TP=245, FP=104, FN=683, TN=7205 |
| AdaBoost | Est=300 | 0.904 | 0.701 | 0.253 | 0.818 | TP=235, FP=100, FN=693, TN=7209 |
| Decision Tree | Depth=3 | 0.901 | 0.716 | 0.202 | 0.722 | TP=187, FP=74, FN=741, TN=7235 |
| Decision Tree | Depth=8 | 0.900 | 0.622 | 0.288 | 0.785 | TP=267, FP=162, FN=661, TN=7147 |
| Random Forest | Trees=100 | 0.903 | 0.643 | 0.312 | 0.801 | TP=290, FP=161, FN=638, TN=7148 |
| Random Forest | Trees=300 | 0.903 | 0.645 | 0.309 | 0.806 | TP=287, FP=158, FN=641, TN=7151 |
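All six models sit near 90% accuracy because the majority class alone is about 89% of the test set, so AUC and recall on "yes" are the metrics that actually separate them: AdaBoost with 300 rounds ranks best by AUC, while the forests recover the most positives at the 0.5 cutoff. To compare ranking quality directly, one could overlay ROC curves; this sketch recomputes the probability vectors, since `prob` was overwritten above:

roc_dt <- pROC::roc(test$y, predict(dt2, newdata = test, type = "prob")[,"yes"],
                    levels = c("no","yes"), direction = "<", quiet = TRUE)
roc_rf <- pROC::roc(test$y, predict(rf2, newdata = test, type = "prob")[,"yes"],
                    levels = c("no","yes"), direction = "<", quiet = TRUE)
plot(roc_dt, col = "tomato", legacy.axes = TRUE)
plot(roc_rf, col = "steelblue", add = TRUE)
legend("bottomright", legend = c("Tree (depth=8)", "Forest (300 trees)"),
       col = c("tomato", "steelblue"), lwd = 2)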