set.seed(42)                            # reproducible train/test split
options(dplyr.summarise.inform = FALSE) # silence dplyr grouping messages

library(tidyverse)
library(janitor)        # clean_names()
library(caret)          # createDataPartition()
library(pROC)           # ROC curves and AUC
library(rpart)          # decision trees
library(randomForest)   # bagged tree ensembles
library(adabag)         # AdaBoost
library(knitr)          # kable()

theme_set(theme_minimal())

1) Data Import & Cleaning

file_path <- "/Users/michaelrobinson/Downloads/bank+marketing/bank-additional/bank-additional-full.csv"

bank <- read.csv(file_path, sep = ";", header = TRUE, stringsAsFactors = FALSE) |>
  clean_names() |>
  mutate(y = factor(y, levels = c("no","yes"))) |>
  # Drop duration: it is only known after the call ends, so keeping it
  # would leak the outcome into the predictors (per the UCI documentation).
  dplyr::select(-dplyr::any_of("duration")) |>
  mutate(across(where(is.character), as.factor))

glimpse(bank)
## Rows: 41,188
## Columns: 20
## $ age            <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <fct> housemaid, services, services, admin., services, servic…
## $ marital        <fct> married, married, married, married, married, married, m…
## $ education      <fct> basic.4y, high.school, high.school, basic.6y, high.scho…
## $ default        <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, …
## $ housing        <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,…
## $ loan           <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n…
## $ contact        <fct> telephone, telephone, telephone, telephone, telephone, …
## $ month          <fct> may, may, may, may, may, may, may, may, may, may, may, …
## $ day_of_week    <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, …
## $ campaign       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <fct> nonexistent, nonexistent, nonexistent, nonexistent, non…
## $ emp_var_rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons_price_idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons_conf_idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr_employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
table(bank$y)
## 
##    no   yes 
## 36548  4640
sum(is.na(bank))
## [1] 0
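
The target is imbalanced: only about 11.3% of clients subscribed (4,640 of 41,188). A majority-class baseline that always predicts "no" therefore already scores about 88.7% accuracy, a useful reference point for the accuracy figures reported below:

# Majority-class baseline accuracy
prop.table(table(bank$y))       # ~0.887 no / ~0.113 yes
max(prop.table(table(bank$y)))  # accuracy of always predicting "no"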

2) Stratified Train/Test Split + Factor-Level Harmonization

Purpose:
- Creates an 80/20 stratified split (keeps yes/no ratio stable).
- Ensures test factor levels match training (avoids “new levels” prediction errors).
- If a novel level appears in test, it is mapped to "unknown" (or the first level if "unknown" is unavailable).

set.seed(42)
idx  <- caret::createDataPartition(bank$y, p = 0.80, list = FALSE)
train <- bank[idx, ]
test  <- bank[-idx, ]

factor_cols <- setdiff(names(train)[sapply(train, is.factor)], "y")

for (col in factor_cols) {
  allowed <- levels(train[[col]])
  if (is.null(allowed)) next

  # Remap any level unseen in training to the fallback level
  novel_idx <- !(test[[col]] %in% allowed)
  if (any(novel_idx, na.rm = TRUE)) {
    fallback <- if ("unknown" %in% allowed) "unknown" else allowed[1]
    test[[col]][novel_idx] <- fallback
  }

  # Align the test factor to the training levels
  test[[col]] <- factor(test[[col]], levels = allowed)
}
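
A quick check (optional, not required by the pipeline) confirms the stratified split preserved the class ratio in both partitions:

# Class proportions should be ~0.887/0.113 in both partitions
round(prop.table(table(train$y)), 3)
round(prop.table(table(test$y)), 3)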

3) Metric Helper (Accuracy, Precision(Yes), Recall(Yes), ROC AUC)

Purpose:
- Centralizes metric computation.
- Uses pROC for AUC with the correct positive class.
- Returns counts (TP/FP/FN/TN) for confusion summaries.
- Assumes pred_class was derived at a consistent threshold (0.5 throughout these experiments).

metrics <- function(y_true, prob_yes, pred_class){
  
  y_true     <- factor(y_true, levels = c("no","yes"))
  pred_class <- factor(pred_class, levels = c("no","yes"))
  
  # Pin the direction ("no" = controls, "yes" = cases) so pROC never
  # auto-flips it; quiet = TRUE suppresses the "Setting direction" message.
  auc <- as.numeric(pROC::auc(pROC::roc(
    y_true, prob_yes,
    levels = c("no","yes"), direction = "<", quiet = TRUE
  )))
  
  # Both inputs carry both factor levels, so the table is always 2x2.
  cm <- table(truth = y_true, pred = pred_class)
  TP <- cm["yes","yes"]; FP <- cm["no","yes"]
  FN <- cm["yes","no"];  TN <- cm["no","no"]
  
  acc  <- (TP + TN) / sum(cm)
  prec <- if ((TP + FP) > 0) TP / (TP + FP) else NA_real_
  rec  <- if ((TP + FN) > 0) TP / (TP + FN) else NA_real_
  
  data.frame(
    Accuracy = acc,
    Precision_Yes = prec,
    Recall_Yes = rec,
    ROC_AUC = auc,
    TP = TP, FP = FP, FN = FN, TN = TN,
    stringsAsFactors = FALSE
  )
}
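
A small smoke test on hypothetical toy inputs helps confirm the helper behaves as intended; perfect predictions should yield 1 for accuracy, precision, recall, and AUC:

# Toy sanity check (hypothetical data, not part of the experiments)
toy_truth <- factor(c("no", "no", "yes", "yes"), levels = c("no", "yes"))
metrics(toy_truth,
        prob_yes   = c(0.1, 0.2, 0.8, 0.9),
        pred_class = c("no", "no", "yes", "yes"))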

4) Decision Tree — Experiment 1

Purpose: A shallower tree (maxdepth=3) should generalize better (less variance), likely yielding higher precision but lower recall than deeper trees.

results <- tibble()

ctrl <- rpart.control(maxdepth = 3, cp = 0.0, minsplit = 20)
dt1  <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)

prob <- predict(dt1, newdata = test, type = "prob")[,"yes"]
pred <- factor(ifelse(prob >= 0.5, "yes", "no"), levels = c("no","yes"))

res_dt1 <- metrics(test$y, prob, pred) |>
  mutate(Algorithm = "Decision Tree", Experiment = "Depth=3")
res_dt1
results <- bind_rows(results, res_dt1)
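
To see which splits the shallow tree actually chose, the fitted object can be printed along with its cost-complexity table (an optional inspection step):

# Inspect the fitted splits and the cost-complexity table
print(dt1)
rpart::printcp(dt1)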

5) Decision Tree — Experiment 2

Purpose: A deeper tree (maxdepth=8) should increase recall and AUC (lower bias), possibly with some precision loss.

ctrl <- rpart.control(maxdepth = 8, cp = 0.0, minsplit = 20)
dt2  <- rpart::rpart(y ~ ., data = train, method = "class", control = ctrl)

prob <- predict(dt2, newdata = test, type = "prob")[,"yes"]
pred <- factor(ifelse(prob >= 0.5, "yes", "no"), levels = c("no","yes"))

res_dt2 <- metrics(test$y, prob, pred) |>
  mutate(Algorithm = "Decision Tree", Experiment = "Depth=8")
res_dt2
results <- bind_rows(results, res_dt2)
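
Comparing leaf counts makes the capacity difference between the two trees concrete; dt2 should have far more terminal nodes than dt1 (a quick descriptive check):

# Number of terminal nodes (leaves) in each fitted tree
sum(dt1$frame$var == "<leaf>")
sum(dt2$frame$var == "<leaf>")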

6) Random Forest — Experiment 1 (100 Trees)

Purpose: Bagging many trees reduces variance. With 100 trees, expect better AUC/recall than a single tree while keeping accuracy and precision competitive.

# Defaults: mtry = floor(sqrt(p)) predictors tried at each split
rf1 <- randomForest::randomForest(y ~ ., data = train, ntree = 100)

prob <- predict(rf1, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf1, newdata = test, type = "response")

res_rf1 <- metrics(test$y, prob, pred) |>
  mutate(Algorithm = "Random Forest", Experiment = "Trees=100")
res_rf1
results <- bind_rows(results, res_rf1)
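
An optional diagnostic: which predictors the forest relies on can be read from the mean decrease in Gini impurity stored on the fitted object:

# Top predictors by mean decrease in Gini impurity
imp <- randomForest::importance(rf1)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)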

7) Random Forest — Experiment 2 (300 Trees)

Purpose: Increasing to 300 trees should slightly improve or stabilize AUC (law of large numbers for bagging) with similar precision/recall.

rf2 <- randomForest::randomForest(y ~ ., data = train, ntree = 300)

prob <- predict(rf2, newdata = test, type = "prob")[,"yes"]
pred <- predict(rf2, newdata = test, type = "response")

res_rf2 <- metrics(test$y, prob, pred) |>
  mutate(Algorithm = "Random Forest", Experiment = "Trees=300")
res_rf2
results <- bind_rows(results, res_rf2)
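
Whether 300 trees actually improve on 100 can be judged from the out-of-bag error, which typically flattens well before 300 trees; a sketch using the err.rate matrix that randomForest stores on the fit:

# Final OOB error for each forest; the curve should plateau
tail(rf1$err.rate[, "OOB"], 1)
tail(rf2$err.rate[, "OOB"], 1)
plot(rf2, main = "OOB error vs. number of trees")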

8) AdaBoost — Experiment 1 (100 Rounds, depth=3)

Purpose: With 100 boosting rounds and maxdepth=3 weak learners, expect higher AUC (lower bias) and strong precision at a 0.5 threshold.

ab1 <- adabag::boosting(
  y ~ ., data = train, boos = TRUE, mfinal = 100,
  control = rpart.control(maxdepth = 3)
)

pred_obj <- predict(ab1, newdata = test)

votes <- pred_obj$votes
colnames(votes) <- levels(train$y)      # vote columns follow the training class order
prob <- votes[,"yes"] / rowSums(votes)  # normalize weighted votes into a pseudo-probability

pred <- factor(pred_obj$class, levels = c("no","yes"))

res_ab1 <- metrics(test$y, prob, pred) |>
  mutate(Algorithm = "AdaBoost", Experiment = "Est=100")
res_ab1
results <- bind_rows(results, res_ab1)
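
adabag also accumulates a relative variable-importance score across the boosting rounds, which offers a cross-check against the random-forest ranking (optional diagnostic):

# Relative variable importance from the boosted trees
sort(ab1$importance, decreasing = TRUE)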

9) AdaBoost — Experiment 2 (300 Rounds, depth=3)

Purpose: More rounds (300) should further improve AUC (better ranking of positives over negatives) while precision stays high; recall may fluctuate at a fixed 0.5 threshold.

ab2 <- adabag::boosting(
  y ~ ., data = train, boos = TRUE, mfinal = 300,
  control = rpart.control(maxdepth = 3)
)

pred_obj <- predict(ab2, newdata = test)
votes <- pred_obj$votes
colnames(votes) <- levels(train$y)
prob <- votes[,"yes"] / rowSums(votes)

pred <- factor(pred_obj$class, levels = c("no","yes"))

res_ab2 <- metrics(test$y, prob, pred) |>
  mutate(Algorithm = "AdaBoost", Experiment = "Est=300")
res_ab2
results <- bind_rows(results, res_ab2)
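
To see whether the extra rounds helped or the ensemble started to overfit, adabag::errorevol tracks the error as rounds are added (an optional diagnostic on the test set):

# Test-set error as a function of boosting rounds
evol_test <- adabag::errorevol(ab2, newdata = test)
plot(evol_test$error, type = "l",
     xlab = "Boosting round", ylab = "Test error")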

10) Results Table

Purpose:
- Rounds key metrics for readability.
- Adds a compact confusion matrix string.
- Presents all six experiments in a single table (rubric-friendly).

results |>
  mutate(across(c(Accuracy, Precision_Yes, Recall_Yes, ROC_AUC), ~round(.x, 3))) |>
  mutate(Confusion = paste0("TP=", TP, ", FP=", FP, ", FN=", FN, ", TN=", TN)) |>
  select(Algorithm, Experiment, Accuracy, Precision_Yes, Recall_Yes, ROC_AUC, Confusion) |>
  arrange(Algorithm, Experiment) |>
  knitr::kable(caption = "Experiment Results")
Table: Experiment Results

| Algorithm     | Experiment | Accuracy | Precision_Yes | Recall_Yes | ROC_AUC | Confusion                       |
|:--------------|:-----------|---------:|--------------:|-----------:|--------:|:--------------------------------|
| AdaBoost      | Est=100    |    0.904 |         0.702 |      0.264 |   0.812 | TP=245, FP=104, FN=683, TN=7205 |
| AdaBoost      | Est=300    |    0.904 |         0.701 |      0.253 |   0.818 | TP=235, FP=100, FN=693, TN=7209 |
| Decision Tree | Depth=3    |    0.901 |         0.716 |      0.202 |   0.722 | TP=187, FP=74, FN=741, TN=7235  |
| Decision Tree | Depth=8    |    0.900 |         0.622 |      0.288 |   0.785 | TP=267, FP=162, FN=661, TN=7147 |
| Random Forest | Trees=100  |    0.903 |         0.643 |      0.312 |   0.801 | TP=290, FP=161, FN=638, TN=7148 |
| Random Forest | Trees=300  |    0.903 |         0.645 |      0.309 |   0.806 | TP=287, FP=158, FN=641, TN=7151 |
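
Recall is modest for every model at the fixed 0.5 cutoff, which is expected with an ~11% positive rate. If catching more subscribers matters more than precision, the threshold can be tuned on the ROC curve. A sketch using pROC::coords on the highest-AUC model (AdaBoost, Est=300), recomputing its test-set probabilities first:

# Youden-optimal threshold for the AdaBoost (Est=300) probabilities
votes_ab2 <- predict(ab2, newdata = test)$votes
colnames(votes_ab2) <- levels(train$y)
prob_ab2 <- votes_ab2[, "yes"] / rowSums(votes_ab2)

roc_ab2 <- pROC::roc(test$y, prob_ab2, levels = c("no", "yes"),
                     direction = "<", quiet = TRUE)
pROC::coords(roc_ab2, x = "best", best.method = "youden",
             ret = c("threshold", "sensitivity", "specificity"))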