library(tidyverse)
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(ada)
## Warning: package 'ada' was built under R version 4.4.3
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
set.seed(123)
# Download the zip file (into a temporary file)
temp_file <- tempfile(fileext = ".zip")
download.file("https://archive.ics.uci.edu/static/public/222/bank+marketing.zip",
destfile = temp_file, mode = "wb")
# List contents of the ZIP (to see filenames)
zip_contents <- unzip(temp_file, list = TRUE)
# Extract the first archive entry; per readr's message below, bank-full.csv is read from it
csv_name <- zip_contents$Name[1]
unzip(temp_file, files = csv_name, exdir = tempdir())
csv_path <- file.path(tempdir(), csv_name)
# Read Data
bank_data <- read_delim(csv_path, delim = ";")
## Multiple files in zip: reading 'bank-full.csv'
## Rows: 45211 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (10): job, marital, education, default, housing, loan, contact, month, p...
## dbl (7): age, balance, day, duration, campaign, pdays, previous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
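As the parsing message suggests, the column specification can be pinned down (or the message silenced) on later runs. A minimal sketch, assuming the same file path:
# Optional: quiet the column-spec message on re-reads, as the message above suggests
bank_data <- read_delim(csv_path, delim = ";", show_col_types = FALSE)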
# Overview
glimpse(bank_data)
## Rows: 45,211
## Columns: 17
## $ age <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
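Because recall on the "yes" class is a key KPI here, it is worth checking the target's class balance before modeling; in this dataset the positive class is a small minority. A quick check:
# Check target class balance; "yes" (term-deposit subscription) is the minority class
bank_data %>% count(y) %>% mutate(prop = n / sum(n))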
# Identify and remove duplicates
duplicate_count <- sum(duplicated(bank_data))
bank_data <- bank_data %>% distinct()
# Clean up strings: trim stray whitespace and normalize case
bank_data <- bank_data %>%
mutate(across(where(is.character), str_trim)) %>% # remove extra spaces
mutate(across(where(is.character), tolower))
# Handle missing values in numeric columns with median imputation
bank_data <- bank_data %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Convert categorical features to factors
bank_data <- bank_data %>%
mutate(across(where(is.character), as.factor))
# Scale numeric features to ensure comparability across variables
num_cols <- sapply(bank_data, is.numeric)
bank_data[num_cols] <- scale(bank_data[num_cols])
# Train/test split
set.seed(123)
trainIndex <- createDataPartition(bank_data$y, p = 0.8, list = FALSE)
train_bank_data <- bank_data[trainIndex, ]
test_bank_data <- bank_data[-trainIndex, ]
# Summary Checks
cat("Duplicates removed:", duplicate_count, "\n")
## Duplicates removed: 0
cat("Training set size:", nrow(train_bank_data), "\n")
## Training set size: 36170
cat("Test set size:", nrow(test_bank_data), "\n")
## Test set size: 9041
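One caveat on the pipeline above: scale() uses statistics from the full dataset, so test-set information leaks into preprocessing. A minimal alternative sketch, assuming the split is performed on unscaled data, fits the scaler on the training split only:
# Sketch (assumes an unscaled split): estimate centering/scaling on train only, then apply to both
pp <- caret::preProcess(train_bank_data[, num_cols, drop = FALSE], method = c("center", "scale"))
train_scaled <- predict(pp, train_bank_data)
test_scaled <- predict(pp, test_bank_data)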
This phase, which follows the EDA in Part 1, is devoted to model testing on the Portuguese Bank Marketing dataset. The objective is to determine which algorithm best predicts client subscription to term deposits by comparing AdaBoost, Random Forest, and Decision Tree classifiers, with the business goal of increasing subscriber conversion rates while lowering outreach costs.
Each algorithm undergoes two tests with systematically varied parameters, so that bias-variance tradeoffs can be weighed alongside business KPIs such as precision, recall, and AUC-ROC, all of which contribute to marketing effectiveness and conversion-forecast accuracy.
“Systematic experimentation and tuning, rather than model complexity alone, are essential for machine learning success,” as Lantz (2024) highlights.
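In that spirit, parameter changes can also be swept automatically rather than one experiment at a time. A minimal cross-validated grid-search sketch with caret (the cp grid values are illustrative assumptions, not tuned recommendations):
# Sketch: 5-fold CV over an illustrative rpart complexity-parameter grid
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE,
summaryFunction = twoClassSummary)
dt_cv <- train(y ~ ., data = train_bank_data, method = "rpart", metric = "ROC",
trControl = ctrl, tuneGrid = data.frame(cp = c(0.001, 0.005, 0.01)))
dt_cv$bestTune # best cp by cross-validated AUC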
# Fix the factor reference level so "no" is the negative class in both splits
train_bank_data$y <- relevel(train_bank_data$y, ref = "no")
test_bank_data$y <- relevel(test_bank_data$y, ref = "no")
# Helper: confusion-matrix metrics plus AUC for one set of predictions
eval_model <- function(truth, pred_class, pred_prob, positive = "yes") {
cm <- caret::confusionMatrix(pred_class, truth, positive = positive)
# compute AUC only if probabilities are provided
auc_val <- NA_real_
if (!is.null(pred_prob)) {
roc_obj <- try(pROC::roc(response = truth, predictor = pred_prob, levels = rev(levels(truth))), silent = TRUE)
if (!inherits(roc_obj, "try-error")) auc_val <- as.numeric(pROC::auc(roc_obj))
}
tibble::tibble(
Accuracy = as.numeric(cm$overall["Accuracy"]),
Kappa = as.numeric(cm$overall["Kappa"]),
Precision = as.numeric(cm$byClass["Pos Pred Value"]),
Recall = as.numeric(cm$byClass["Sensitivity"]),
F1 = as.numeric(cm$byClass["F1"]),
AUC = auc_val
)
}
# Container for storing hyperparameters and results
experiment_log <- tibble::tibble(
Algorithm = character(),
Experiment = character(),
Seed = integer(),
Hyperparameters = character(),
Accuracy = numeric(),
Kappa = numeric(),
Precision = numeric(),
Recall = numeric(),
F1 = numeric(),
AUC = numeric()
)
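The add_row blocks below repeat the same fields for every experiment. A small hypothetical helper (log_experiment, not used in the runs below) shows how the logging could be consolidated:
# Sketch: append one experiment's metrics to the log in a single call
log_experiment <- function(log, algorithm, experiment, seed, hyper, metrics) {
dplyr::bind_rows(log, dplyr::bind_cols(
tibble::tibble(Algorithm = algorithm, Experiment = experiment,
Seed = as.integer(seed), Hyperparameters = hyper),
metrics))
}
# e.g. experiment_log <- log_experiment(experiment_log, "Decision Tree",
# "Baseline", 123, "rpart default", dt1_metrics)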
set.seed(123)
dt1_params <- list(method = "rpart", control = "default")
dt1 <- rpart::rpart(y ~ ., data = train_bank_data, method = "class") # baseline
# predictions
dt1_pred_class <- predict(dt1, test_bank_data, type = "class")
dt1_pred_prob <- predict(dt1, test_bank_data, type = "prob")[, "yes"]
# evaluate
dt1_metrics <- eval_model(test_bank_data$y, dt1_pred_class, dt1_pred_prob)
## Setting direction: controls > cases
# log
experiment_log <- experiment_log %>%
tibble::add_row(
Algorithm = "Decision Tree",
Experiment = "Baseline",
Seed = 123,
Hyperparameters = "rpart default",
Accuracy = dt1_metrics$Accuracy,
Kappa = dt1_metrics$Kappa,
Precision = dt1_metrics$Precision,
Recall = dt1_metrics$Recall,
F1 = dt1_metrics$F1,
AUC = dt1_metrics$AUC
)
cat("DT1 (Baseline): Default Rpart. Note the significant training fit found in prior EDAs.
Expect more volatility.\n\n")
## DT1 (Baseline): Default Rpart. Note the significant training fit found in prior EDAs.
## Expect more volatility.
set.seed(123)
dt2_params <- list(maxdepth = 5, minsplit = 20)
dt2 <- rpart::rpart(y ~ ., data = train_bank_data, method = "class",
control = rpart.control(maxdepth = dt2_params$maxdepth, minsplit = dt2_params$minsplit))
dt2_pred_class <- predict(dt2, test_bank_data, type = "class")
dt2_pred_prob <- predict(dt2, test_bank_data, type = "prob")[, "yes"]
dt2_metrics <- eval_model(test_bank_data$y, dt2_pred_class, dt2_pred_prob)
## Setting direction: controls > cases
experiment_log <- experiment_log %>%
tibble::add_row(
Algorithm = "Decision Tree",
Experiment = "Pruned (maxdepth=5, minsplit=20)",
Seed = 123,
Hyperparameters = paste0("maxdepth=", dt2_params$maxdepth, "; minsplit=", dt2_params$minsplit),
Accuracy = dt2_metrics$Accuracy,
Kappa = dt2_metrics$Kappa,
Precision = dt2_metrics$Precision,
Recall = dt2_metrics$Recall,
F1 = dt2_metrics$F1,
AUC = dt2_metrics$AUC
)
cat("DT2 (Pruned): depth is restricted; Reduced volatility and enhanced F1 are predicted.
Suitable for regulations that are easily interpreted.\n\n")
## DT2 (Pruned): depth is restricted; Reduced volatility and enhanced F1 are predicted.
## Suitable for regulations that are easily interpreted.
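Since rpart.plot is attached above but not yet used, the pruned tree can be drawn to support that interpretability claim. A minimal sketch:
# Visualize the pruned tree (rpart.plot was loaded earlier)
rpart.plot::rpart.plot(dt2, type = 2, extra = 104, main = "DT2: pruned tree (maxdepth = 5)")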
set.seed(456)
rf1_params <- list(ntree = 100, mtry = floor(sqrt(ncol(train_bank_data) - 1)))
rf1 <- randomForest::randomForest(y ~ ., data = train_bank_data, ntree = rf1_params$ntree, mtry = rf1_params$mtry, importance = TRUE)
rf1_pred_class <- predict(rf1, test_bank_data, type = "response")
rf1_pred_prob <- predict(rf1, test_bank_data, type = "prob")[, "yes"]
rf1_metrics <- eval_model(test_bank_data$y, rf1_pred_class, rf1_pred_prob)
## Setting direction: controls > cases
experiment_log <- experiment_log %>%
tibble::add_row(
Algorithm = "Random Forest",
Experiment = "100 trees",
Seed = 456,
Hyperparameters = paste0("ntree=", rf1_params$ntree, "; mtry=", rf1_params$mtry),
Accuracy = rf1_metrics$Accuracy,
Kappa = rf1_metrics$Kappa,
Precision = rf1_metrics$Precision,
Recall = rf1_metrics$Recall,
F1 = rf1_metrics$F1,
AUC = rf1_metrics$AUC
)
cat("RF1 has 100 trees. Ensemble minimizes variation and enhances recall (a useful KPI for marketing. \n\n")
## RF1 has 100 trees. Ensemble minimizes variation and enhances recall (a useful KPI for marketing.
set.seed(456)
rf2_params <- list(ntree = 300, maxnodes = 50, mtry = rf1_params$mtry)
rf2 <- randomForest::randomForest(y ~ ., data = train_bank_data, ntree = rf2_params$ntree, mtry = rf2_params$mtry, maxnodes = rf2_params$maxnodes, importance = TRUE)
rf2_pred_class <- predict(rf2, test_bank_data, type = "response")
rf2_pred_prob <- predict(rf2, test_bank_data, type = "prob")[, "yes"]
rf2_metrics <- eval_model(test_bank_data$y, rf2_pred_class, rf2_pred_prob)
## Setting direction: controls > cases
experiment_log <- experiment_log %>%
tibble::add_row(
Algorithm = "Random Forest",
Experiment = "300 trees (maxnodes=50)",
Seed = 456,
Hyperparameters = paste0("ntree=", rf2_params$ntree, "; maxnodes=", rf2_params$maxnodes, "; mtry=", rf2_params$mtry),
Accuracy = rf2_metrics$Accuracy,
Kappa = rf2_metrics$Kappa,
Precision = rf2_metrics$Precision,
Recall = rf2_metrics$Recall,
F1 = rf2_metrics$F1,
AUC = rf2_metrics$AUC
)
cat("RF 2: 300 trees with limited nodes. Improved stability and, in many cases, better generalization than the baseline.\n\n")
## RF 2: 300 trees with limited nodes. Improved stability and, in many cases, better generalization than the baseline.
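Both forests were fit with importance = TRUE, so the drivers of their predictions can be inspected directly. A minimal sketch:
# Top variables by importance (importance = TRUE was set when fitting)
randomForest::varImpPlot(rf2, n.var = 10, main = "RF2: top 10 variables")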
set.seed(789)
ada1_params <- list(iter = 50, nu = 1.0)
ada1 <- ada::ada(y ~ ., data = train_bank_data, iter = ada1_params$iter, nu = ada1_params$nu, type = "discrete")
# predict.ada supports several return types; type = "vector" yields class labels
ada1_pred_class <- predict(ada1, test_bank_data, type = "vector")
# probabilities come via type = "prob"/"probs"; fall back gracefully if column extraction fails
ada1_pred_prob <- tryCatch(predict(ada1, test_bank_data, type = "prob")[, "yes"], error = function(e) {
# fallback: 0/1 pseudo-probabilities from the predicted class (coarse, but keeps the pipeline running)
as.numeric(ada1_pred_class == "yes")
})
ada1_metrics <- eval_model(test_bank_data$y, factor(ada1_pred_class, levels = levels(test_bank_data$y)), ada1_pred_prob)
## Setting direction: controls < cases
experiment_log <- experiment_log %>%
tibble::add_row(
Algorithm = "AdaBoost",
Experiment = "50 learners (nu=1.0)",
Seed = 789,
Hyperparameters = paste0("iter=", ada1_params$iter, "; nu=", ada1_params$nu),
Accuracy = ada1_metrics$Accuracy,
Kappa = ada1_metrics$Kappa,
Precision = ada1_metrics$Precision,
Recall = ada1_metrics$Recall,
F1 = ada1_metrics$F1,
AUC = ada1_metrics$AUC
)
cat("Ada1: 50 learners, higher learning rate; reduces bias but watch variance. Good baseline for boosting.\n\n")
## Ada1: 50 learners, higher learning rate; reduces bias but watch variance. Good baseline for boosting.
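One caution: the AdaBoost AUC values in the results table below sit well under 0.5, which is consistent with the extracted probability column tracking the wrong class. Since predict.ada's probability matrix may come back without column names, a defensive sketch (assuming its columns follow the level order of the training response) is:
# Sketch: name predict.ada's probability columns before extracting "yes"
# (assumes columns follow the level order of the training factor)
ada1_probs <- predict(ada1, test_bank_data, type = "probs")
colnames(ada1_probs) <- levels(train_bank_data$y)
ada1_pred_prob_checked <- ada1_probs[, "yes"]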
set.seed(789)
ada2_params <- list(iter = 200, nu = 0.5)
ada2 <- ada::ada(y ~ ., data = train_bank_data, iter = ada2_params$iter, nu = ada2_params$nu, type = "discrete")
ada2_pred_class <- predict(ada2, test_bank_data, type = "vector")
ada2_pred_prob <- tryCatch(predict(ada2, test_bank_data, type = "prob")[, "yes"], error = function(e) {
as.numeric(ada2_pred_class == "yes")
})
ada2_metrics <- eval_model(test_bank_data$y, factor(ada2_pred_class, levels = levels(test_bank_data$y)), ada2_pred_prob)
## Setting direction: controls < cases
experiment_log <- experiment_log %>%
tibble::add_row(
Algorithm = "AdaBoost",
Experiment = "200 learners (nu=0.5)",
Seed = 789,
Hyperparameters = paste0("iter=", ada2_params$iter, "; nu=", ada2_params$nu),
Accuracy = ada2_metrics$Accuracy,
Kappa = ada2_metrics$Kappa,
Precision = ada2_metrics$Precision,
Recall = ada2_metrics$Recall,
F1 = ada2_metrics$F1,
AUC = ada2_metrics$AUC
)
cat("Ada2: 200 learners, slower learning rate - usually decreases variation and increases generalization. Strong candidate.\n\n")
## Ada2: 200 learners, slower learning rate - usually decreases variation and increases generalization. Strong candidate.
# Print results table
experiment_log <- experiment_log %>%
dplyr::mutate(
Accuracy = round(Accuracy, 4),
Kappa = round(Kappa, 4),
Precision = round(Precision, 4),
Recall = round(Recall, 4),
F1 = round(F1, 4),
AUC = round(AUC, 4)
)
knitr::kable(experiment_log, caption = "Experimentation Log: metrics and hyperparameters", align = "l")
| Algorithm | Experiment | Seed | Hyperparameters | Accuracy | Kappa | Precision | Recall | F1 | AUC |
|---|---|---|---|---|---|---|---|---|---|
| Decision Tree | Baseline | 123 | rpart default | 0.8978 | 0.3805 | 0.6173 | 0.3311 | 0.4310 | 0.7227 |
| Decision Tree | Pruned (maxdepth=5, minsplit=20) | 123 | maxdepth=5; minsplit=20 | 0.8978 | 0.3805 | 0.6173 | 0.3311 | 0.4310 | 0.7227 |
| Random Forest | 100 trees | 456 | ntree=100; mtry=4 | 0.9070 | 0.4917 | 0.6385 | 0.4711 | 0.5422 | 0.9271 |
| Random Forest | 300 trees (maxnodes=50) | 456 | ntree=300; maxnodes=50; mtry=4 | 0.8975 | 0.2771 | 0.7211 | 0.2006 | 0.3138 | 0.8866 |
| AdaBoost | 50 learners (nu=1.0) | 789 | iter=50; nu=1 | 0.8991 | 0.4635 | 0.5864 | 0.4655 | 0.5190 | 0.2890 |
| AdaBoost | 200 learners (nu=0.5) | 789 | iter=200; nu=0.5 | 0.9028 | 0.4707 | 0.6129 | 0.4570 | 0.5236 | 0.2906 |
# Plot: F1 by experiment for quick visual comparison (ggplot2 is already attached via tidyverse)
ggplot(experiment_log, aes(x = Experiment, y = F1, fill = Algorithm)) +
geom_col(position = position_dodge()) +
coord_flip() +
labs(title = "F1-score by Experiment", y = "F1-score", x = "")