Load Packages

library(tidyverse)
library(caret)
library(ranger)
library(randomForest)
library(pROC)
library(rpart)
library(rpart.plot)
library(ranger)
library(doParallel)
library(rsample)
library(ada)

set.seed(622)

Load Data

df <- read_delim("bank-full.csv", delim = ";",
                   escape_double = FALSE, trim_ws = TRUE) |> 
  rename(target = y)

Decision Tree Experimentation

Prep data

# Coerce target data to factor
df$target <- factor(df$target) 



# Create a split that keeps both classes in train and test
make_split_with_both_classes <- function(dat, p = 0.8, strata = "target", max_tries = 100) {
  for (i in 1:max_tries) {
    idx <- createDataPartition(dat[[strata]], p = p, list = FALSE)
    train <- dat[idx, , drop = FALSE]
    test  <- dat[-idx, , drop = FALSE]
    if (nlevels(droplevels(train[[strata]])) == 2 &&
        nlevels(droplevels(test[[strata]]))  == 2) {
      return(list(train = train, test = test))
    }
  }
}

splits <- make_split_with_both_classes(df, p = 0.8, strata = "target")
train <- splits$train
test  <- splits$test

# Create function to organize result metrics
metrics <- function(y_true, y_pred_class, y_pred_prob = NULL) {
  y_true <- droplevels(y_true)
  lv <- levels(y_true)

  # Confusion-matrix-based metrics for two levels
  if (nlevels(y_true) == 2) {
    positive <- lv[2]
    tp <- sum(y_true == positive & y_pred_class == positive)
    fp <- sum(y_true != positive & y_pred_class == positive)
    fn <- sum(y_true == positive & y_pred_class != positive)
    tn <- sum(y_true != positive & y_pred_class != positive)

    accuracy  <- (tp + tn) / (tp + tn + fp + fn)
    precision <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
    recall    <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
    f1        <- ifelse(is.na(precision) | is.na(recall) | (precision + recall == 0),
                        NA, 2 * precision * recall / (precision + recall))

    # AUC only if we have a prob column for the positive class
    auc <- NA
    if (!is.null(y_pred_prob)) {
      auc <- as.numeric(roc(y_true, y_pred_prob, levels = lv, quiet = TRUE)$auc)
    }

    return(data.frame(accuracy, precision, recall, f1, auc, check.names = FALSE))
  } else {
    accuracy <- mean(y_pred_class == y_true)
    return(data.frame(accuracy = accuracy,
                      precision = NA, recall = NA, f1 = NA, auc = NA,
                      check.names = FALSE))
  }
}


fold_ids <- createFolds(train$target, k = 5, returnTrain = TRUE)
ctrl_shared <- trainControl(
  method = "cv",
  index = fold_ids,
  classProbs = TRUE,
  summaryFunction = twoClassSummary, # optimize ROC
  savePredictions = "final"
)


pos_level <- tail(levels(train$target), 1)

First Model

m_base <- train(
  target ~ ., data = train,
  method = "rpart",
  trControl = ctrl_shared,
  metric = "ROC",
  tuneLength = 20
)

Second Model

m_depth <- train(
  target ~ ., data = train,
  method = "rpart2",  # tunes maxdepth instead of cp
  trControl = ctrl_shared,
  metric = "ROC",
  tuneLength = 15     
)
## note: only 8 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 8 .

Evaluation Metrics

# Evaluate both models on same test set
p_base_prob <- predict(m_base,  newdata = test, type = "prob")[, pos_level, drop = TRUE]
p_base_cls  <- predict(m_base,  newdata = test)
m_base_metrics <- metrics(test$target, p_base_cls, p_base_prob)
m_base_metrics$model <- "Tuned (cp)"

p_depth_prob <- predict(m_depth, newdata = test, type = "prob")[, pos_level, drop = TRUE]
p_depth_cls  <- predict(m_depth, newdata = test)
m_depth_metrics <- metrics(test$target, p_depth_cls, p_depth_prob)
m_depth_metrics$model <- "Tuned (maxdepth)"

results <- rbind(
  m_base_metrics[, c("model","accuracy","precision","recall","f1","auc")],
  m_depth_metrics[, c("model","accuracy","precision","recall","f1","auc")]
)
print(results)
##              model  accuracy precision    recall        f1       auc
## 1       Tuned (cp) 0.9074217 0.6540616 0.4418165 0.5273857 0.8639102
## 2 Tuned (maxdepth) 0.9013383 0.6565465 0.3273415 0.4368687 0.7490713

Visualize Trees

rpart.plot(m_base$finalModel,  main = sprintf("Base (cp): cp=%.4f", m_base$bestTune$cp))
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

rpart.plot(m_depth$finalModel, main = sprintf("Variant (maxdepth): depth=%d", m_depth$bestTune$maxdepth))

Random Forest Experimentation

Prep Data

# Positive class FIRST (for consistent AUC direction later)
pos <- tail(levels(df$target), 1)
neg <- setdiff(levels(df$target), pos)[1]
df$target <- factor(df$target, levels = c(pos, neg))  # positive first

# Light predictor cleanup
char_cols <- names(df)[sapply(df, is.character)]
for (cc in char_cols) df[[cc]] <- as.factor(df[[cc]])


# Train/Test split
spl <- initial_split(df, prop = 0.8, strata = target)
train <- training(spl)
test  <- testing(spl)

pos_level <- levels(train$target)[1]  # positive is first


safe_auc <- function(y_true, y_prob_pos, pos_label = pos_level) {
  y_true <- droplevels(y_true)
  if (nlevels(y_true) < 2) return(NA_real_)
  # pROC expects levels=c(neg,pos); we already have pos first, so reverse:
  as.numeric(roc(response = y_true, predictor = y_prob_pos,
                 levels = rev(levels(y_true)), quiet = TRUE)$auc)
}

safe_acc <- function(y_true, y_pred) mean(y_true == y_pred)

Prep Models

# one-fit-one-predict for ranger (classification)
fit_rf <- function(form, data, num.trees = 200, mtry = NULL,
                   min.node.size = 10, max.depth = 20, sample.fraction = 0.7) {
  ranger(
    formula = form,
    data = data,
    num.trees = num.trees,
    mtry = mtry,
    min.node.size = min.node.size,
    max.depth = max.depth,
    sample.fraction = sample.fraction,
    probability = TRUE,   # we need class probs for ROC
    classification = TRUE,
    respect.unordered.factors = "order",  # fast, consistent
    num.threads = max(1, parallel::detectCores() - 1),
    seed = 42,
    verbose = FALSE
  )
}

# Two different RF models
p <- ncol(train) - 1
mtry_A <- max(1, round(sqrt(p)))
mtry_B <- max(1, round(0.25 * p))

cfg_A <- list(num.trees = 200, mtry = mtry_A, min.node.size = 10, max.depth = 20, sample.fraction = 0.7)
cfg_B <- list(num.trees = 300, mtry = mtry_B, min.node.size = 15, max.depth = 20, sample.fraction = 0.7)

form <- as.formula("target ~ .")

folds <- vfold_cv(train, v = 3, strata = target)

cv_eval <- function(cfg) {
  aucs <- c(); accs <- c()
  for (i in seq_len(nrow(folds))) {
    tr_ix <- folds$splits[[i]]$in_id
    tr <- train[tr_ix, , drop = FALSE]
    va <- assessment(folds$splits[[i]])

    # Fit on fold-train
    fit <- fit_rf(form, tr, num.trees = cfg$num.trees, mtry = cfg$mtry,
                  min.node.size = cfg$min.node.size, max.depth = cfg$max.depth,
                  sample.fraction = cfg$sample.fraction)

    # Predict on fold-assessment
    prob <- predict(fit, data = va, type = "response")$predictions[, pos_level]
    cls  <- ifelse(prob >= 0.5, pos_level, levels(train$target)[2])
    cls  <- factor(cls, levels = levels(train$target))

    # Metrics (skip AUC if single-class assessment)
    aucs[i] <- safe_auc(va$target, prob, pos_label = pos_level)
    accs[i] <- safe_acc(va$target, cls)
  }
  c(
    auc_mean = mean(aucs, na.rm = TRUE),
    auc_sd   = sd(aucs,  na.rm = TRUE),
    acc_mean = mean(accs),
    acc_sd   = sd(accs)
  )
}

cv_A <- cv_eval(cfg_A)
cv_B <- cv_eval(cfg_B)

cat("\n=== CV Results ===\n")
## 
## === CV Results ===
print(rbind(
  `RF-A (mtry ~ sqrt(p))` = cv_A,
  `RF-B (mtry ~ 0.25*p, larger leaves)` = cv_B
))
##                                      auc_mean      auc_sd  acc_mean
## RF-A (mtry ~ sqrt(p))               0.9299809 0.001734804 0.9074596
## RF-B (mtry ~ 0.25*p, larger leaves) 0.9302337 0.002245681 0.9067131
##                                           acc_sd
## RF-A (mtry ~ sqrt(p))               0.0008046334
## RF-B (mtry ~ 0.25*p, larger leaves) 0.0005903118

Fit Models

fit_A <- fit_rf(form, train, num.trees = cfg_A$num.trees, mtry = cfg_A$mtry,
                min.node.size = cfg_A$min.node.size, max.depth = cfg_A$max.depth,
                sample.fraction = cfg_A$sample.fraction)

fit_B <- fit_rf(form, train, num.trees = cfg_B$num.trees, mtry = cfg_B$mtry,
                min.node.size = cfg_B$min.node.size, max.depth = cfg_B$max.depth,
                sample.fraction = cfg_B$sample.fraction)

prob_A <- predict(fit_A, data = test, type = "response")$predictions[, pos_level]
cls_A  <- factor(ifelse(prob_A >= 0.5, pos_level, levels(train$target)[2]),
                 levels = levels(train$target))

prob_B <- predict(fit_B, data = test, type = "response")$predictions[, pos_level]
cls_B  <- factor(ifelse(prob_B >= 0.5, pos_level, levels(train$target)[2]),
                 levels = levels(train$target))

Evaluation Metrics

test_metrics <- function(lbl) {
  function(prob, cls) {
    data.frame(
      model    = lbl,
      accuracy = safe_acc(test$target, cls),
      auc      = safe_auc(test$target, prob, pos_label = pos_level),
      precision = {
        positive <- pos_level
        tp <- sum(test$target == positive & cls == positive)
        fp <- sum(test$target != positive & cls == positive)
        if (tp + fp == 0) NA_real_ else tp/(tp+fp)
      },
      recall = {
        positive <- pos_level
        tp <- sum(test$target == positive & cls == positive)
        fn <- sum(test$target == positive & cls != positive)
        if (tp + fn == 0) NA_real_ else tp/(tp+fn)
      }
    )
  }
}

met_A <- test_metrics("RF-A")(prob_A, cls_A)
met_B <- test_metrics("RF-B")(prob_B, cls_B)

cat("\n=== Test Results ===\n")
## 
## === Test Results ===
print(rbind(met_A, met_B))
##   model  accuracy       auc precision    recall
## 1  RF-A 0.9077740 0.9331561 0.6637427 0.4291115
## 2  RF-B 0.9075528 0.9330660 0.6632353 0.4262760

Variable Importance

#Refit both models with variable importance enabled
fit_A_imp <- ranger(
  formula = form, data = train,
  num.trees = cfg_A$num.trees,
  mtry = cfg_A$mtry,
  min.node.size = cfg_A$min.node.size,
  max.depth = cfg_A$max.depth,
  sample.fraction = cfg_A$sample.fraction,
  probability = TRUE, classification = TRUE,
  respect.unordered.factors = "order",
  importance = "impurity",        # <--- change to "permutation" for permutation importance
  num.threads = max(1, parallel::detectCores() - 1),
  seed = 42
)

fit_B_imp <- ranger(
  formula = form, data = train,
  num.trees = cfg_B$num.trees,
  mtry = cfg_B$mtry,
  min.node.size = cfg_B$min.node.size,
  max.depth = cfg_B$max.depth,
  sample.fraction = cfg_B$sample.fraction,
  probability = TRUE, classification = TRUE,
  respect.unordered.factors = "order",
  importance = "impurity",        # <--- or "permutation"
  num.threads = max(1, parallel::detectCores() - 1),
  seed = 42
)

imp_A <- sort(fit_A_imp$variable.importance, decreasing = TRUE)
imp_B <- sort(fit_B_imp$variable.importance, decreasing = TRUE)

imp_A_df <- data.frame(feature = names(imp_A), importance = as.numeric(imp_A), row.names = NULL)
imp_B_df <- data.frame(feature = names(imp_B), importance = as.numeric(imp_B), row.names = NULL)

cat("\nTop 15 features — Model A\n")
## 
## Top 15 features — Model A
print(head(imp_A_df, 15))
##      feature importance
## 1   duration 1271.95532
## 2      month  411.53807
## 3   poutcome  349.76453
## 4    balance  334.53758
## 5        age  328.63157
## 6        day  297.23299
## 7      pdays  197.86732
## 8        job  152.76310
## 9   campaign  112.35607
## 10   housing   91.67150
## 11  previous   86.62537
## 12   contact   83.78486
## 13 education   73.48927
## 14   marital   58.11050
## 15      loan   30.23858
cat("\nTop 15 features — Model B\n")
## 
## Top 15 features — Model B
print(head(imp_B_df, 15))
##      feature importance
## 1   duration 1217.47812
## 2      month  390.20289
## 3   poutcome  350.77981
## 4        age  286.92198
## 5    balance  282.76165
## 6        day  258.84177
## 7      pdays  178.81014
## 8        job  128.56799
## 9   campaign   94.22775
## 10   housing   84.86288
## 11   contact   79.95396
## 12  previous   78.10499
## 13 education   60.27271
## 14   marital   47.58503
## 15      loan   25.39396

Adaboost Experimentation

Prep data

#Train/Test split
idx <- caret::createDataPartition(df$target, p = 0.8, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]
pos_level <- levels(train$target)[1]  # positive is FIRST

# CV folds
make_strict_folds <- function(y, k = 3, max_tries = 200) {
  for (i in 1:max_tries) {
    f <- caret::createFolds(y, k = k, returnTrain = TRUE)
    ok <- all(vapply(f, function(tr) length(unique(y[-tr])) == 2, logical(1)))
    if (ok) return(f)
  }
}
fold_ids <- make_strict_folds(train$target, k = 3)


safe_auc <- function(y_true, y_prob_pos) {
  y_true <- droplevels(y_true)
  if (nlevels(y_true) < 2) return(NA_real_)
  as.numeric(roc(y_true, y_prob_pos, levels = rev(levels(y_true)), quiet = TRUE)$auc)
}
safe_acc <- function(y_true, y_pred) mean(y_true == y_pred)

metrics_full <- function(y_true, y_pred, y_prob_pos) {
  y_true <- droplevels(y_true)
  positive <- levels(y_true)[1]
  tp <- sum(y_true == positive & y_pred == positive)
  fp <- sum(y_true != positive & y_pred == positive)
  fn <- sum(y_true == positive & y_pred != positive)
  tn <- sum(y_true != positive & y_pred != positive)
  acc <- (tp + tn) / (tp + tn + fp + fn)
  pre <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
  rec <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
  f1  <- ifelse(is.na(pre) | is.na(rec) | (pre + rec == 0), NA, 2 * pre * rec / (pre + rec))
  auc <- safe_auc(y_true, y_prob_pos)
  data.frame(accuracy = acc, precision = pre, recall = rec, f1 = f1, auc = auc, check.names = FALSE)
}

Model Experimentation

# Model configurations
form <- target ~ .
cfg_A <- list(iter = 150, depth = 2)   # shallower, fewer rounds
cfg_B <- list(iter = 250, depth = 3)   # slightly deeper, more rounds

fit_ada <- function(cfg, data) {
  ada(
    formula = form,
    data = data,
    iter = cfg$iter,
    nu = 1,                                  # learning rate
    control = rpart.control(maxdepth = cfg$depth, cp = 0)
  )
}

# Modify predictor for ada
predict_ada <- function(model, newdata, pos_level) {
  # Try to get both classes + probabilities
  pr_both <- tryCatch(predict(model, newdata, type = "both"), error = function(e) NULL)

  if (!is.null(pr_both)) {
    prob <- pr_both$prob
    cls  <- factor(pr_both$class, levels = levels(newdata$target))
  } else {
    prob <- tryCatch(predict(model, newdata, type = "prob"),   error = function(e) NULL)
    cls  <- factor(tryCatch(predict(model, newdata, type = "vector"), error = function(e) NA),
                   levels = levels(newdata$target))
  }

  # Coerce to matrix and ensure column names
  if (is.null(prob)) {
    prob_pos <- rep(NA_real_, nrow(newdata))
  } else {
    prob_mat <- as.matrix(prob)

    if (is.null(colnames(prob_mat)) && ncol(prob_mat) == length(levels(newdata$target))) {
      colnames(prob_mat) <- levels(newdata$target)
    }

    if (!is.null(colnames(prob_mat)) && pos_level %in% colnames(prob_mat)) {
      prob_pos <- prob_mat[, pos_level]
    } else if (ncol(prob_mat) == 2) {
      # Fall back: positive is FIRST factor level per our setup
      prob_pos <- prob_mat[, 1]
    } else if (is.vector(prob) && length(prob) == nrow(newdata)) {
      prob_pos <- as.numeric(prob)
    } else {
      prob_pos <- rep(NA_real_, nrow(newdata))
    }
  }

  list(prob_pos = prob_pos, cls = cls)
}

# 3-fold CV
cv_eval <- function(cfg) {
  aucs <- c(); accs <- c()
  for (i in seq_along(fold_ids)) {
    tr_ix <- fold_ids[[i]]
    tr <- train[tr_ix, ]
    va <- train[-tr_ix, ]

    fit  <- fit_ada(cfg, tr)
    pred <- predict_ada(fit, va, pos_level)

    aucs[i] <- safe_auc(va$target, pred$prob_pos)
    accs[i] <- safe_acc(va$target, pred$cls)
  }
  c(AUC_mean = mean(aucs, na.rm=TRUE), AUC_sd = sd(aucs, na.rm=TRUE),
    ACC_mean = mean(accs), ACC_sd = sd(accs))
}

Evaluation

# CV results
cv_A <- cv_eval(cfg_A)
cv_B <- cv_eval(cfg_B)

cat("\n=== CV Results (AdaBoost via `ada`) ===\n")
## 
## === CV Results (AdaBoost via `ada`) ===
print(rbind(`ADA-A` = cv_A, `ADA-B` = cv_B))
##        AUC_mean      AUC_sd  ACC_mean       ACC_sd
## ADA-A 0.9230700 0.006310872 0.9033176 0.0028190190
## ADA-B 0.9246751 0.002834275 0.9037047 0.0002239786
# Final Metrics
fit_A <- fit_ada(cfg_A, train)
fit_B <- fit_ada(cfg_B, train)

pred_A <- predict_ada(fit_A, test, pos_level)
pred_B <- predict_ada(fit_B, test, pos_level)

met_A <- metrics_full(test$target, pred_A$cls, pred_A$prob_pos); met_A$model <- "ADA-A"
met_B <- metrics_full(test$target, pred_B$cls, pred_B$prob_pos); met_B$model <- "ADA-B"

cat("\n=== Test Results ===\n")
## 
## === Test Results ===
print(rbind(
  met_A[, c("model","accuracy","precision","recall","f1","auc")],
  met_B[, c("model","accuracy","precision","recall","f1","auc")]
))
##   model  accuracy precision    recall        f1       auc
## 1 ADA-A 0.9014490 0.6155989 0.4181646 0.4980282 0.9185975
## 2 ADA-B 0.9034399 0.6167513 0.4597919 0.5268293 0.9235401