Load Library

library(tidyverse)
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(ada)
## Warning: package 'ada' was built under R version 4.4.3
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
set.seed(123)

Load Data from UCI

# Download the zip file (into a temporary file)
temp_file <- tempfile(fileext = ".zip")
download.file("https://archive.ics.uci.edu/static/public/222/bank+marketing.zip", 
              destfile = temp_file, mode = "wb")

# List contents of the ZIP (to see filenames)
zip_contents <- unzip(temp_file, list = TRUE)

#  Unzip the specific CSV file
csv_name <- zip_contents$Name[1]  # pick first or appropriate
unzip(temp_file, files = csv_name, exdir = tempdir())
csv_path <- file.path(tempdir(), csv_name)

# Read Data
bank_data <- read_delim(csv_path, delim = ";")
## Multiple files in zip: reading 'bank-full.csv'
## Rows: 45211 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (10): job, marital, education, default, housing, loan, contact, month, p...
## dbl  (7): age, balance, day, duration, campaign, pdays, previous
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Overview 
glimpse(bank_data)
## Rows: 45,211
## Columns: 17
## $ age       <dbl> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance   <dbl> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day       <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration  <dbl> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays     <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

Inspecting Quality and Cleaning

# Identifying and remove duplicates
duplicate_count <- sum(duplicated(bank_data))
bank_data <- bank_data %>% distinct()

# Clean up str and Inconsistencies of whitespace
bank_data <- bank_data %>%
  mutate_if(is.character, str_trim) %>%  # remove extra spaces
  mutate(across(where(is.character), tolower))

# Handling missing values with median imputation involve numeric colums
bank_data <- bank_data %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Converting Categorical featues to factors
bank_data <- bank_data %>%
  mutate(across(where(is.character), as.factor))

# Scale numeric features to ensure comparability across variables
num_cols <- sapply(bank_data, is.numeric)
bank_data[num_cols] <- scale(bank_data[num_cols])
# Train- Test Split
set.seed(123)
trainIndex <- createDataPartition(bank_data$y, p = 0.8, list = FALSE)
train_bank_data <- bank_data[trainIndex, ]
test_bank_data  <- bank_data[-trainIndex, ]

# Summary Checks

cat("Duplicates removed:", duplicate_count, "\n")
## Duplicates removed: 0
cat("Training set size:", nrow(train_bank_data), "\n")
## Training set size: 36170
cat("Test set size:", nrow(test_bank_data), "\n")
## Test set size: 9041

Introduction

This phase, which follows the EDA in Part 1, is devoted to model testing with the Portuguese Bank Marketing dataset. The objective is to determine which algorithm better predicts client subscription to term deposits by comparing the AdaBoost, Random Forest, and Decision Tree classifiers. Increasing subscriber conversion rates while lowering outreach expenses.

To evaluate bias-variance tradeoffs and business KPIs like precision, recall, and AUC-ROC—all of which contribute to improved marketing effectiveness and conversion forecast accuracy—each algorithm will undergo two tests with parameters systematically changed.

“Systematic experimentation and tuning, rather than model complexity alone, are essential for machine learning success,” as Lantz (2024) highlights.

# Ensuring Reproducibility
train_bank_data$y <- relevel(train_bank_data$y, ref = "no")
test_bank_data$y  <- relevel(test_bank_data$y, ref = "no")

function to calculate core metrics + AUC

eval_model <- function(truth, pred_class, pred_prob, positive = "yes") {
  cm <- caret::confusionMatrix(pred_class, truth, positive = positive)
  # compute AUC if probabilities provided
  auc_val <- NA
  if (!is.null(pred_prob)) {
    roc_obj <- try(pROC::roc(response = truth, predictor = pred_prob, levels = rev(levels(truth))), silent = TRUE)
    if (!inherits(roc_obj, "try-error")) auc_val <- as.numeric(pROC::auc(roc_obj))
  }
   tibble::tibble(
    Accuracy = as.numeric(cm$overall["Accuracy"]),
    Kappa    = as.numeric(cm$overall["Kappa"]),
    Precision = as.numeric(cm$byClass["Pos Pred Value"]),
    Recall    = as.numeric(cm$byClass["Sensitivity"]),
    F1        = as.numeric(cm$byClass["F1"]),
    AUC       = auc_val
  )
}
# Container for storing hyperparameters and results
experiment_log <- tibble::tibble(
  Algorithm = character(),
  Experiment = character(),
  Seed = integer(),
  Hyperparameters = character(),
  Accuracy = numeric(),
  Kappa = numeric(),
  Precision = numeric(),
  Recall = numeric(),
  F1 = numeric(),
  AUC = numeric()
)

Decision Tree Exp 1

set.seed(123)
dt1_params <- list(method = "rpart", control = "default")
dt1 <- rpart::rpart(y ~ ., data = train_bank_data, method = "class")  # baseline

# predictions
dt1_pred_class <- predict(dt1, test_bank_data, type = "class")
dt1_pred_prob  <- predict(dt1, test_bank_data, type = "prob")[, "yes"]


# evaluate
dt1_metrics <- eval_model(test_bank_data$y, dt1_pred_class, dt1_pred_prob)
## Setting direction: controls > cases
# log
experiment_log <- experiment_log %>%
  tibble::add_row(
    Algorithm = "Decision Tree",
    Experiment = "Baseline",
    Seed = 123,
    Hyperparameters = paste0("rpart default"),
    Accuracy = dt1_metrics$Accuracy,
    Kappa = dt1_metrics$Kappa,
    Precision = dt1_metrics$Precision,
    Recall = dt1_metrics$Recall,
    F1 = dt1_metrics$F1,
    AUC = dt1_metrics$AUC
  )

cat("DT1 (Baseline): Default Rpart. Note the significant training fit found in prior EDAs. 
Expect more volatility.\n\n")
## DT1 (Baseline): Default Rpart. Note the significant training fit found in prior EDAs. 
## Expect more volatility.

Decision Tree - Exp 2 (Pruned)

set.seed(123)
dt2_params <- list(maxdepth = 5, minsplit = 20)
dt2 <- rpart::rpart(y ~ ., data = train_bank_data, method = "class",
                    control = rpart.control(maxdepth = dt2_params$maxdepth, minsplit = dt2_params$minsplit))

dt2_pred_class <- predict(dt2, test_bank_data, type = "class")
dt2_pred_prob  <- predict(dt2, test_bank_data, type = "prob")[, "yes"]

dt2_metrics <- eval_model(test_bank_data$y, dt2_pred_class, dt2_pred_prob)
## Setting direction: controls > cases
experiment_log <- experiment_log %>%
  tibble::add_row(
    Algorithm = "Decision Tree",
    Experiment = "Pruned (maxdepth=5, minsplit=20)",
    Seed = 123,
    Hyperparameters = paste0("maxdepth=", dt2_params$maxdepth, "; minsplit=", dt2_params$minsplit),
    Accuracy = dt2_metrics$Accuracy,
    Kappa = dt2_metrics$Kappa,
    Precision = dt2_metrics$Precision,
    Recall = dt2_metrics$Recall,
    F1 = dt2_metrics$F1,
    AUC = dt2_metrics$AUC
  )

cat("DT2 (Pruned): depth is restricted; Reduced volatility and enhanced F1 are predicted. 
Suitable for regulations that are easily interpreted.\n\n")
## DT2 (Pruned): depth is restricted; Reduced volatility and enhanced F1 are predicted. 
## Suitable for regulations that are easily interpreted.

Random Forest - Exp 1 (100 trees)

set.seed(456)
rf1_params <- list(ntree = 100, mtry = floor(sqrt(ncol(train_bank_data) - 1)))
rf1 <- randomForest::randomForest(y ~ ., data = train_bank_data, ntree = rf1_params$ntree, mtry = rf1_params$mtry, importance = TRUE)

rf1_pred_class <- predict(rf1, test_bank_data, type = "response")
rf1_pred_prob  <- predict(rf1, test_bank_data, type = "prob")[, "yes"]

rf1_metrics <- eval_model(test_bank_data$y, rf1_pred_class, rf1_pred_prob)
## Setting direction: controls > cases
experiment_log <- experiment_log %>%
  tibble::add_row(
    Algorithm = "Random Forest",
    Experiment = "100 trees",
    Seed = 456,
    Hyperparameters = paste0("ntree=", rf1_params$ntree, "; mtry=", rf1_params$mtry),
    Accuracy = rf1_metrics$Accuracy,
    Kappa = rf1_metrics$Kappa,
    Precision = rf1_metrics$Precision,
    Recall = rf1_metrics$Recall,
    F1 = rf1_metrics$F1,
    AUC = rf1_metrics$AUC
  )

cat("RF1 has 100 trees. Ensemble minimizes variation and enhances recall (a useful KPI for marketing. \n\n")
## RF1 has 100 trees. Ensemble minimizes variation and enhances recall (a useful KPI for marketing.

Random Forest - Exp 2 (300 trees, maxnodes)

set.seed(456)
rf2_params <- list(ntree = 300, maxnodes = 50, mtry = rf1_params$mtry)
rf2 <- randomForest::randomForest(y ~ ., data = train_bank_data, ntree = rf2_params$ntree, mtry = rf2_params$mtry, maxnodes = rf2_params$maxnodes, importance = TRUE)

rf2_pred_class <- predict(rf2, test_bank_data, type = "response")
rf2_pred_prob  <- predict(rf2, test_bank_data, type = "prob")[, "yes"]

rf2_metrics <- eval_model(test_bank_data$y, rf2_pred_class, rf2_pred_prob)
## Setting direction: controls > cases
experiment_log <- experiment_log %>%
  tibble::add_row(
    Algorithm = "Random Forest",
    Experiment = "300 trees (maxnodes=50)",
    Seed = 456,
    Hyperparameters = paste0("ntree=", rf2_params$ntree, "; maxnodes=", rf2_params$maxnodes, "; mtry=", rf2_params$mtry),
    Accuracy = rf2_metrics$Accuracy,
    Kappa = rf2_metrics$Kappa,
    Precision = rf2_metrics$Precision,
    Recall = rf2_metrics$Recall,
    F1 = rf2_metrics$F1,
    AUC = rf2_metrics$AUC
  )

cat("RF 2: 300 trees with limited nodes. Improved stability and, in many cases, better generalization than the baseline.\n\n")
## RF 2: 300 trees with limited nodes. Improved stability and, in many cases, better generalization than the baseline.

AdaBoost - Exp 1 (50 learners)

set.seed(789)
ada1_params <- list(iter = 50, nu = 1.0)
ada1 <- ada::ada(y ~ ., data = train_bank_data, iter = ada1_params$iter, nu = ada1_params$nu, type = "discrete") 

# Ada::predict occasionally yields different types; when type="class" and "prob" are available, use them.
ada1_pred_class <- predict(ada1, test_bank_data, type = "vector")  # class labels (vector)
# Some ADA implementations return probabilities that are either type="vector" or "probs"; try probabilistic extraction.
ada1_pred_prob <- tryCatch(predict(ada1, test_bank_data, type = "prob")[, "yes"], error = function(e) {
 # fallback: employ the predicted class with a 0/1 probability (not optimal, but maintains pipeline stability).
  as.numeric(ada1_pred_class == "yes")
})

ada1_metrics <- eval_model(test_bank_data$y, factor(ada1_pred_class, levels = levels(test_bank_data$y)), ada1_pred_prob)
## Setting direction: controls < cases
experiment_log <- experiment_log %>%
  tibble::add_row(
    Algorithm = "AdaBoost",
    Experiment = "50 learners (nu=1.0)",
    Seed = 789,
    Hyperparameters = paste0("iter=", ada1_params$iter, "; nu=", ada1_params$nu),
    Accuracy = ada1_metrics$Accuracy,
    Kappa = ada1_metrics$Kappa,
    Precision = ada1_metrics$Precision,
    Recall = ada1_metrics$Recall,
    F1 = ada1_metrics$F1,
    AUC = ada1_metrics$AUC
  )

cat("Ada1: 50 learners, higher learning rate; reduces bias but watch variance. Good baseline for boosting.\n\n")
## Ada1: 50 learners, higher learning rate; reduces bias but watch variance. Good baseline for boosting.

AdaBoost - Exp 2 (200 learners, smaller nu)

set.seed(789)
ada2_params <- list(iter = 200, nu = 0.5)
ada2 <- ada::ada(y ~ ., data = train_bank_data, iter = ada2_params$iter, nu = ada2_params$nu, type = "discrete")

ada2_pred_class <- predict(ada2, test_bank_data, type = "vector")
ada2_pred_prob  <- tryCatch(predict(ada2, test_bank_data, type = "prob")[, "yes"], error = function(e) {
  as.numeric(ada2_pred_class == "yes")
})

ada2_metrics <- eval_model(test_bank_data$y, factor(ada2_pred_class, levels = levels(test_bank_data$y)), ada2_pred_prob)
## Setting direction: controls < cases
experiment_log <- experiment_log %>%
  tibble::add_row(
    Algorithm = "AdaBoost",
    Experiment = "200 learners (nu=0.5)",
    Seed = 789,
    Hyperparameters = paste0("iter=", ada2_params$iter, "; nu=", ada2_params$nu),
    Accuracy = ada2_metrics$Accuracy,
    Kappa = ada2_metrics$Kappa,
    Precision = ada2_metrics$Precision,
    Recall = ada2_metrics$Recall,
    F1 = ada2_metrics$F1,
    AUC = ada2_metrics$AUC
  )

cat("Ada2: 200 learners, slower learning rate - usually decreases variation and increases generalization. Strong candidate.\n\n")
## Ada2: 200 learners, slower learning rate - usually decreases variation and increases generalization. Strong candidate.

Results Table and quick Visual

# Print results table
experiment_log <- experiment_log %>%
  dplyr::mutate(
    Accuracy = round(Accuracy, 4),
    Kappa = round(Kappa, 4),
    Precision = round(Precision, 4),
    Recall = round(Recall, 4),
    F1 = round(F1, 4),
    AUC = round(AUC, 4)
  )

knitr::kable(experiment_log, caption = "Experimentation Log: metrics and hyperparameters", align = "l")
Experimentation Log: metrics and hyperparameters
Algorithm Experiment Seed Hyperparameters Accuracy Kappa Precision Recall F1 AUC
Decision Tree Baseline 123 rpart default 0.8978 0.3805 0.6173 0.3311 0.4310 0.7227
Decision Tree Pruned (maxdepth=5, minsplit=20) 123 maxdepth=5; minsplit=20 0.8978 0.3805 0.6173 0.3311 0.4310 0.7227
Random Forest 100 trees 456 ntree=100; mtry=4 0.9070 0.4917 0.6385 0.4711 0.5422 0.9271
Random Forest 300 trees (maxnodes=50) 456 ntree=300; maxnodes=50; mtry=4 0.8975 0.2771 0.7211 0.2006 0.3138 0.8866
AdaBoost 50 learners (nu=1.0) 789 iter=50; nu=1 0.8991 0.4635 0.5864 0.4655 0.5190 0.2890
AdaBoost 200 learners (nu=0.5) 789 iter=200; nu=0.5 0.9028 0.4707 0.6129 0.4570 0.5236 0.2906
# Plot: F1 by experiment for quick visual comparison
library(ggplot2)
ggplot(experiment_log, aes(x = Experiment, y = F1, fill = Algorithm)) +
  geom_col(position = position_dodge()) +
  coord_flip() +
  labs(title = "F1-score by Experiment", y = "F1-score", x = "")