Note: This notebook runs six experiments (two each for Decision Tree, Random Forest, and AdaBoost) on the UCI Bank Marketing dataset. It uses a stratified 80/20 train/test split (caret::createDataPartition) with the duration column removed, since call duration is only known after the call ends and would leak the target. Metrics reported: Accuracy, Precision (Yes), Recall (Yes), and ROC AUC.
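For reference, taking "yes" as the positive class and TP/FP/FN/TN as counted in the confusion matrices below, the threshold-based metrics are:

$$
\text{Accuracy} = \frac{TP + TN}{TP + FP + FN + TN}, \qquad
\text{Precision}_{\text{yes}} = \frac{TP}{TP + FP}, \qquad
\text{Recall}_{\text{yes}} = \frac{TP}{TP + FN}
$$

ROC AUC is threshold-free: it is the probability that a randomly chosen "yes" case receives a higher predicted score than a randomly chosen "no" case.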
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(knitr)
set.seed(42)
options(dplyr.summarise.inform = FALSE)
theme_set(theme_minimal())
file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"
bank <- read.csv(file_path, sep = ";", header = TRUE, stringsAsFactors = FALSE) %>%
  clean_names()
# Ensure the target is a factor with "yes" as the positive (second) level
bank <- bank %>% mutate(y = factor(y, levels = c("no","yes")))
# Drop leakage feature if present
if ("duration" %in% names(bank)) bank <- bank %>% select(-duration)
# Treat empty strings as NA, then convert all character predictors to factors
bank <- bank %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  mutate(across(where(is.character), as.factor))
# Quick sanity checks
glimpse(bank)
## Rows: 41,188
## Columns: 20
## $ age <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <fct> housemaid, services, services, admin., services, servic…
## $ marital <fct> married, married, married, married, married, married, m…
## $ education <fct> basic.4y, high.school, high.school, basic.6y, high.scho…
## $ default <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp_var_rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons_price_idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons_conf_idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr_employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
table(bank$y)
##
## no yes
## 36548 4640
sum(is.na(bank))
## [1] 0
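The classes are heavily imbalanced (about 11.3% "yes"), so a majority-class baseline already scores high accuracy; a quick check using the counts just printed:
# Baseline: always predicting "no" yields 36548 / 41188 ~ 0.887 accuracy,
# so the ~0.90 accuracies reported below should be read against this floor.
prop.table(table(bank$y))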
set.seed(42)
idx <- caret::createDataPartition(bank$y, p = 0.8, list = FALSE)
train <- bank[idx, ]
test <- bank[-idx, ]
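Because createDataPartition() draws a stratified sample on y, both partitions should preserve the ~11% positive rate; a quick sanity check:
# Sanity check: the split keeps the class ratio in train and test
round(rbind(train = prop.table(table(train$y)),
            test  = prop.table(table(test$y))), 3)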
# --- FIX: harmonize factor levels so the test set has no levels unseen in training ---
factor_cols <- setdiff(names(train)[sapply(train, is.factor)], "y")
for (col in factor_cols) {
  allowed <- levels(train[[col]])
  if (is.null(allowed)) next
  # Recode any test value that does not appear among the training levels
  novel_idx <- !(test[[col]] %in% allowed)
  if (any(novel_idx, na.rm = TRUE)) {
    fallback <- if ("unknown" %in% allowed) "unknown" else allowed[1]
    test[[col]][novel_idx] <- fallback
  }
  # Align test levels exactly to training
  test[[col]] <- factor(test[[col]], levels = allowed)
}
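To see the failure this loop prevents, here is a minimal toy sketch (synthetic data, not the bank set): prediction stops when newdata carries a factor level absent from training.
# Toy illustration of the "new level" failure and the recoding fix
tr_toy <- data.frame(x = factor(c("a", "b", "a", "b")),
                     y = factor(c("no", "yes", "no", "yes")))
fit_toy <- rpart::rpart(y ~ x, data = tr_toy, method = "class",
                        control = rpart.control(minsplit = 2, cp = 0))
te_toy <- data.frame(x = factor("c"))                 # level "c" unseen in training
# predict(fit_toy, te_toy) would stop with a "new level" error for x
te_toy$x <- factor("a", levels = levels(tr_toy$x))    # recode to a known level
predict(fit_toy, te_toy, type = "prob")               # now succeeds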
# Helper: metrics function (keeps "yes" as positive class)
metrics <- function(y_true, prob_yes, pred_class){
  auc <- as.numeric(pROC::auc(pROC::roc(y_true, prob_yes, levels = c("no","yes"))))
  cm <- table(truth = y_true, pred = pred_class)
  # Guard the lookups in case a class never appears in truth or predictions
  TP <- ifelse("yes" %in% rownames(cm) && "yes" %in% colnames(cm), cm["yes","yes"], 0)
  FP <- ifelse("no" %in% rownames(cm) && "yes" %in% colnames(cm), cm["no","yes"], 0)
  FN <- ifelse("yes" %in% rownames(cm) && "no" %in% colnames(cm), cm["yes","no"], 0)
  TN <- ifelse("no" %in% rownames(cm) && "no" %in% colnames(cm), cm["no","no"], 0)
  acc  <- (TP + TN) / sum(cm)
  prec <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  rec  <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  data.frame(Accuracy = acc, Precision_Yes = prec, Recall_Yes = rec, ROC_AUC = auc,
             TP = TP, FP = FP, FN = FN, TN = TN, stringsAsFactors = FALSE)
}
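A quick sanity check of metrics() on hand-made toy vectors (hypothetical values, just to confirm the counting logic):
# One each of TP, FP, FN, TN -> Accuracy, Precision and Recall all 0.5;
# AUC should be 0.75 (3 of the 4 yes/no pairs are ranked correctly)
y_toy   <- factor(c("no", "no", "yes", "yes"), levels = c("no", "yes"))
p_toy   <- c(0.1, 0.6, 0.4, 0.9)
cls_toy <- factor(ifelse(p_toy >= 0.5, "yes", "no"), levels = c("no", "yes"))
metrics(y_toy, p_toy, cls_toy)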
results <- tibble()
# Experiment 1: Decision Tree, maxdepth = 3
ctrl <- rpart.control(maxdepth = 3, cp = 0.0, minsplit = 20)
dt1 <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)
prob <- predict(dt1, newdata = test, type = "prob")[,"yes"]
pred <- ifelse(prob >= 0.5, "yes", "no") %>% factor(levels = c("no","yes"))
res_dt1 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Decision Tree", Experiment = "Depth=3")
## Setting direction: controls < cases
res_dt1
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9010562 0.7164751 0.2015086 0.7222933 187 74 741 7235 Decision Tree
## Experiment
## 1 Depth=3
results <- bind_rows(results, res_dt1)
# Experiment 2: Decision Tree, maxdepth = 8
ctrl <- rpart.control(maxdepth = 8, cp = 0.0, minsplit = 20)
dt2 <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)
prob <- predict(dt2, newdata = test, type = "prob")[,"yes"]
pred <- ifelse(prob >= 0.5, "yes", "no") %>% factor(levels = c("no","yes"))
res_dt2 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Decision Tree", Experiment = "Depth=8")
## Setting direction: controls < cases
res_dt2
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.900085 0.6223776 0.2877155 0.7848777 267 162 661 7147 Decision Tree
## Experiment
## 1 Depth=8
results <- bind_rows(results, res_dt2)
# Experiment 3: Random Forest, ntree = 100
rf1 <- randomForest::randomForest(y ~ ., data = train, ntree = 100)
prob <- predict(rf1, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf1, newdata = test, type = "response")
res_rf1 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Random Forest", Experiment = "Trees=100")
## Setting direction: controls < cases
res_rf1
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9029987 0.6430155 0.3125 0.8008831 290 161 638 7148 Random Forest
## Experiment
## 1 Trees=100
results <- bind_rows(results, res_rf1)
# Experiment 4: Random Forest, ntree = 300
rf2 <- randomForest::randomForest(y ~ ., data = train, ntree = 300)
prob <- predict(rf2, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf2, newdata = test, type = "response")
res_rf2 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "Random Forest", Experiment = "Trees=300")
## Setting direction: controls < cases
res_rf2
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9029987 0.6449438 0.3092672 0.8062519 287 158 641 7151 Random Forest
## Experiment
## 1 Trees=300
results <- bind_rows(results, res_rf2)
# Experiment 5: AdaBoost, mfinal = 100 depth-3 base trees
ab1 <- adabag::boosting(y ~ ., data = train, boos = TRUE, mfinal = 100,
                        control = rpart.control(maxdepth = 3))
pred_obj <- predict(ab1, newdata = test)
# Convert votes to Pr(yes)
votes <- pred_obj$votes
colnames(votes) <- levels(train$y)
prob <- votes[,"yes"] / rowSums(votes)
pred <- factor(pred_obj$class, levels = c("no","yes"))
res_ab1 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "AdaBoost", Experiment = "Est=100")
## Setting direction: controls < cases
res_ab1
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9044555 0.7020057 0.2640086 0.8115768 245 104 683 7205 AdaBoost
## Experiment
## 1 Est=100
results <- bind_rows(results, res_ab1)
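Side note on the vote normalization above: predict.boosting() also returns a $prob matrix of normalized vote proportions, which should agree with the manual computation (the column-to-class mapping below is an assumption based on the level order):
# Equivalent shortcut (assumption: column 2 of $prob corresponds to "yes")
prob_direct <- pred_obj$prob[, 2]
all.equal(unname(prob), unname(prob_direct))   # expected TRUE up to numeric tolerance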
# Experiment 6: AdaBoost, mfinal = 300 depth-3 base trees
ab2 <- adabag::boosting(y ~ ., data = train, boos = TRUE, mfinal = 300,
                        control = rpart.control(maxdepth = 3))
pred_obj <- predict(ab2, newdata = test)
votes <- pred_obj$votes
colnames(votes) <- levels(train$y)
prob <- votes[,"yes"] / rowSums(votes)
pred <- factor(pred_obj$class, levels = c("no","yes"))
res_ab2 <- metrics(test$y, prob, pred) %>% mutate(Algorithm = "AdaBoost", Experiment = "Est=300")
## Setting direction: controls < cases
res_ab2
## Accuracy Precision_Yes Recall_Yes ROC_AUC TP FP FN TN Algorithm
## 1 0.9037271 0.7014925 0.2532328 0.8183554 235 100 693 7209 AdaBoost
## Experiment
## 1 Est=300
results <- bind_rows(results, res_ab2)
results %>%
  mutate(across(c(Accuracy, Precision_Yes, Recall_Yes, ROC_AUC), ~ round(.x, 3))) %>%
  mutate(Confusion = paste0("TP=", TP, ", FP=", FP, ", FN=", FN, ", TN=", TN)) %>%
  select(Algorithm, Experiment, Accuracy, Precision_Yes, Recall_Yes, ROC_AUC, Confusion) %>%
  arrange(Algorithm, Experiment) %>%
  knitr::kable(caption = "Experiment Results on Hold-out Test Set")
| Algorithm | Experiment | Accuracy | Precision_Yes | Recall_Yes | ROC_AUC | Confusion |
|---|---|---|---|---|---|---|
| AdaBoost | Est=100 | 0.904 | 0.702 | 0.264 | 0.812 | TP=245, FP=104, FN=683, TN=7205 |
| AdaBoost | Est=300 | 0.904 | 0.701 | 0.253 | 0.818 | TP=235, FP=100, FN=693, TN=7209 |
| Decision Tree | Depth=3 | 0.901 | 0.716 | 0.202 | 0.722 | TP=187, FP=74, FN=741, TN=7235 |
| Decision Tree | Depth=8 | 0.900 | 0.622 | 0.288 | 0.785 | TP=267, FP=162, FN=661, TN=7147 |
| Random Forest | Trees=100 | 0.903 | 0.643 | 0.312 | 0.801 | TP=290, FP=161, FN=638, TN=7148 |
| Random Forest | Trees=300 | 0.903 | 0.645 | 0.309 | 0.806 | TP=287, FP=158, FN=641, TN=7151 |
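Since accuracy is nearly flat across all six runs (0.900 to 0.904), the precision/recall trade-off on the "yes" class is the more informative comparison; a small sketch plotting the table above:
# One point per experiment: recall vs. precision for the positive class
results %>%
  ggplot(aes(Recall_Yes, Precision_Yes, colour = Algorithm, shape = Experiment)) +
  geom_point(size = 3) +
  labs(title = "Hold-out precision vs. recall (positive class: yes)",
       x = "Recall (yes)", y = "Precision (yes)")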