Note: This notebook runs six experiments (two each for Decision Tree, Random Forest, and AdaBoost) on the UCI Bank Marketing dataset. It uses a stratified 80/20 train/test split (caret::createDataPartition) with the duration column removed, since call duration is only known after the call ends and would leak the target. Metrics reported: Accuracy, Precision (Yes), Recall (Yes), and ROC AUC.
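For reference, taking "yes" as the positive class and TP/FP/FN/TN as counted in the confusion matrices below, the threshold-based metrics are:

$$
\text{Accuracy} = \frac{TP + TN}{TP + FP + FN + TN}, \qquad
\text{Precision}_{\text{yes}} = \frac{TP}{TP + FP}, \qquad
\text{Recall}_{\text{yes}} = \frac{TP}{TP + FN}
$$

ROC AUC is threshold-free: it is the probability that a randomly chosen "yes" case receives a higher predicted score than a randomly chosen "no" case.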
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(knitr)
set.seed(42)
options(dplyr.summarise.inform = FALSE)
theme_set(theme_minimal())
file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"
bank <- read.csv(file_path, sep = ";", header = TRUE, stringsAsFactors = FALSE) %>%
  clean_names()
# Ensure the target is a factor with "yes" as the positive (second) level
bank <- bank %>% mutate(y = factor(y, levels = c("no","yes")))
# Drop leakage feature if present
if ("duration" %in% names(bank)) bank <- bank %>% select(-duration)
# Treat empty strings as NA, then convert all character predictors to factors
bank <- bank %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  mutate(across(where(is.character), as.factor))
# Quick sanity checks
glimpse(bank)
## Rows: 41,188
## Columns: 20
## $ age <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <fct> housemaid, services, services, admin., services, servic…
## $ marital <fct> married, married, married, married, married, married, m…
## $ education <fct> basic.4y, high.school, high.school, basic.6y, high.scho…
## $ default <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp_var_rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons_price_idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons_conf_idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr_employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
table(bank$y)
##
## no yes
## 36548 4640
sum(is.na(bank))
## [1] 0
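The classes are heavily imbalanced (about 11.3% "yes"), so a majority-class baseline already scores high accuracy; a quick check using the counts just printed:
# Baseline: always predicting "no" yields 36548 / 41188 ~ 0.887 accuracy,
# so the ~0.90 accuracies reported below should be read against this floor.
prop.table(table(bank$y))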
set.seed(42)
idx <- caret::createDataPartition(bank$y, p = 0.8, list = FALSE)
train <- bank[idx, ]
test <- bank[-idx, ]
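Because createDataPartition() draws a stratified sample on y, both partitions should preserve the ~11% positive rate; a quick sanity check:
# Sanity check: the split keeps the class ratio in train and test
round(rbind(train = prop.table(table(train$y)),
            test  = prop.table(table(test$y))), 3)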
# --- FIX: harmonize factor levels so the test set has no levels unseen in training ---
factor_cols <- setdiff(names(train)[sapply(train, is.factor)], "y")
for (col in factor_cols) {
  allowed <- levels(train[[col]])
  if (is.null(allowed)) next
  # Recode any test value that does not appear among the training levels
  novel_idx <- !(test[[col]] %in% allowed)
  if (any(novel_idx, na.rm = TRUE)) {
    fallback <- if ("unknown" %in% allowed) "unknown" else allowed[1]
    test[[col]][novel_idx] <- fallback
  }
  # Align test levels exactly to training
  test[[col]] <- factor(test[[col]], levels = allowed)
}
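To see the failure this loop prevents, here is a minimal toy sketch (synthetic data, not the bank set): prediction stops when newdata carries a factor level absent from training.
# Toy illustration of the "new level" failure and the recoding fix
tr_toy <- data.frame(x = factor(c("a", "b", "a", "b")),
                     y = factor(c("no", "yes", "no", "yes")))
fit_toy <- rpart::rpart(y ~ x, data = tr_toy, method = "class",
                        control = rpart.control(minsplit = 2, cp = 0))
te_toy <- data.frame(x = factor("c"))                 # level "c" unseen in training
# predict(fit_toy, te_toy) would stop with a "new level" error for x
te_toy$x <- factor("a", levels = levels(tr_toy$x))    # recode to a known level
predict(fit_toy, te_toy, type = "prob")               # now succeeds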
# Helper: metrics function (keeps "yes" as positive class)
metrics <- function(y_true, prob_yes, pred_class){
  auc <- as.numeric(pROC::auc(pROC::roc(y_true, prob_yes, levels = c("no","yes"))))
  cm <- table(truth = y_true, pred = pred_class)
  # Guard the lookups in case a class never appears in truth or predictions
  TP <- ifelse("yes" %in% rownames(cm) && "yes" %in% colnames(cm), cm["yes","yes"], 0)
  FP <- ifelse("no" %in% rownames(cm) && "yes" %in% colnames(cm), cm["no","yes"], 0)
  FN <- ifelse("yes" %in% rownames(cm) && "no" %in% colnames(cm), cm["yes","no"], 0)
  TN <- ifelse("no" %in% rownames(cm) && "no" %in% colnames(cm), cm["no","no"], 0)
  acc  <- (TP + TN) / sum(cm)
  prec <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  rec  <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  data.frame(Accuracy = acc, Precision_Yes = prec, Recall_Yes = rec, ROC_AUC = auc,
             TP = TP, FP = FP, FN = FN, TN = TN, stringsAsFactors = FALSE)
}
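A quick sanity check of metrics() on hand-made toy vectors (hypothetical values, just to confirm the counting logic):
# One each of TP, FP, FN, TN -> Accuracy, Precision and Recall all 0.5;
# AUC should be 0.75 (3 of the 4 yes/no pairs are ranked correctly)
y_toy   <- factor(c("no", "no", "yes", "yes"), levels = c("no", "yes"))
p_toy   <- c(0.1, 0.6, 0.4, 0.9)
cls_toy <- factor(ifelse(p_toy >= 0.5, "yes", "no"), levels = c("no", "yes"))
metrics(y_toy, p_toy, cls_toy)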
results <- tibble()
# Experiment 1: Decision Tree, maxdepth = 3
ctrl <- rpart.control(maxdepth = 3, cp = 0.0, minsplit = 20)
dt1 <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)
prob <- predict(dt1, newdata = test, type = "prob")[,"yes"]
pred <- ifelse(prob >= 0.5, "yes", "no") %>% factor(levels = c("no","yes"))
res_dt1 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Decision Tree", Experiment = "Depth=3")
## Setting direction: controls < cases
res_dt1
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9010562 0.7164751 0.2015086 0.7222933 187 74 741 7235 Decision Tree
## Experiment
## 1 Depth=3
results <- bind_rows(results, res_dt1)
# Experiment 2: Decision Tree, maxdepth = 8
ctrl <- rpart.control(maxdepth = 8, cp = 0.0, minsplit = 20)
dt2 <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)
prob <- predict(dt2, newdata = test, type = "prob")[,"yes"]
pred <- ifelse(prob >= 0.5, "yes", "no") %>% factor(levels = c("no","yes"))
res_dt2 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Decision Tree", Experiment = "Depth=8")
## Setting direction: controls < cases
res_dt2
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.900085 0.6223776 0.2877155 0.7848777 267 162 661 7147 Decision Tree
## Experiment
## 1 Depth=8
results <- bind_rows(results, res_dt2)
# Experiment 3: Random Forest, ntree = 100
rf1 <- randomForest::randomForest(y ~ ., data = train, ntree = 100)
prob <- predict(rf1, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf1, newdata = test, type = "response")
res_rf1 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Random Forest", Experiment = "Trees=100")
## Setting direction: controls < cases
res_rf1
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9029987 0.6430155 0.3125 0.8008831 290 161 638 7148 Random Forest
## Experiment
## 1 Trees=100
results <- bind_rows(results, res_rf1)
# Experiment 4: Random Forest, ntree = 300
rf2 <- randomForest::randomForest(y ~ ., data = train, ntree = 300)
prob <- predict(rf2, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf2, newdata = test, type = "response")
res_rf2 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Random Forest", Experiment = "Trees=300")
## Setting direction: controls < cases
res_rf2
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9029987 0.6449438 0.3092672 0.8062519 287 158 641 7151 Random Forest
## Experiment
## 1 Trees=300
results <- bind_rows(results, res_rf2)
# Experiment 5: AdaBoost, mfinal = 100 depth-3 base trees
ab1 <- adabag::boosting(y ~ ., data = train, boos = TRUE, mfinal = 100,
                        control = rpart.control(maxdepth = 3))
pred_obj <- predict(ab1, newdata = test)
# Convert votes to Pr(yes)
votes <- pred_obj$votes
colnames(votes) <- levels(train$y)
prob <- votes[,"yes"] / rowSums(votes)
pred <- factor(pred_obj$class, levels = c("no","yes"))
res_ab1 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "AdaBoost", Experiment = "Est=100")
## Setting direction: controls < cases
res_ab1
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9044555 0.7020057 0.2640086 0.8115768 245 104 683 7205 AdaBoost
## Experiment
## 1 Est=100
results <- bind_rows(results, res_ab1)
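Side note on the vote normalization above: predict.boosting() also returns a $prob matrix of normalized vote proportions, which should agree with the manual computation (the column-to-class mapping below is an assumption based on the level order):
# Equivalent shortcut (assumption: column 2 of $prob corresponds to "yes")
prob_direct <- pred_obj$prob[, 2]
all.equal(unname(prob), unname(prob_direct))   # expected TRUE up to numeric tolerance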
# Experiment 6: AdaBoost, mfinal = 300 depth-3 base trees
ab2 <- adabag::boosting(y ~ ., data = train, boos = TRUE, mfinal = 300,
                        control = rpart.control(maxdepth = 3))
pred_obj <- predict(ab2, newdata = test)
votes <- pred_obj$votes
colnames(votes) <- levels(train$y)
prob <- votes[,"yes"] / rowSums(votes)
pred <- factor(pred_obj$class, levels = c("no","yes"))
res_ab2 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "AdaBoost", Experiment = "Est=300")
## Setting direction: controls < cases
res_ab2
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9037271 0.7014925 0.2532328 0.8183554 235 100 693 7209 AdaBoost
## Experiment
## 1 Est=300
results <- bind_rows(results, res_ab2)
results %>%
  mutate(across(c(Accuracy, Precision_Yes, Recall_Yes, ROC_AUC), ~ round(.x, 3))) %>%
  mutate(Confusion = paste0("TP=", TP, ", FP=", FP, ", FN=", FN, ", TN=", TN)) %>%
  select(Algorithm, Experiment, Accuracy, Precision_Yes, Recall_Yes, ROC_AUC, Confusion) %>%
  arrange(Algorithm, Experiment) %>%
  knitr::kable(caption = "Experiment Results on Hold-out Test Set")
| Algorithm | Experiment | Accuracy | Precision_Yes | Recall_Yes | ROC_AUC | Confusion |
|---|---|---|---|---|---|---|
| AdaBoost | Est=100 | 0.904 | 0.702 | 0.264 | 0.812 | TP=245, FP=104, FN=683, TN=7205 |
| AdaBoost | Est=300 | 0.904 | 0.701 | 0.253 | 0.818 | TP=235, FP=100, FN=693, TN=7209 |
| Decision Tree | Depth=3 | 0.901 | 0.716 | 0.202 | 0.722 | TP=187, FP=74, FN=741, TN=7235 |
| Decision Tree | Depth=8 | 0.900 | 0.622 | 0.288 | 0.785 | TP=267, FP=162, FN=661, TN=7147 |
| Random Forest | Trees=100 | 0.903 | 0.643 | 0.312 | 0.801 | TP=290, FP=161, FN=638, TN=7148 |
| Random Forest | Trees=300 | 0.903 | 0.645 | 0.309 | 0.806 | TP=287, FP=158, FN=641, TN=7151 |
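Since accuracy is nearly flat across all six runs (0.900 to 0.904), the precision/recall trade-off on the "yes" class is the more informative comparison; a small sketch plotting the table above:
# One point per experiment: recall vs. precision for the positive class
results %>%
  ggplot(aes(Recall_Yes, Precision_Yes, colour = Algorithm, shape = Experiment)) +
  geom_point(size = 3) +
  labs(title = "Hold-out precision vs. recall (positive class: yes)",
       x = "Recall (yes)", y = "Precision (yes)")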