Load Packages

library(tidyverse)
library(caret)
library(ranger)
library(randomForest)
library(pROC)
library(rpart)
library(rpart.plot)
library(ranger)
library(doParallel)
library(rsample)
library(ada)

set.seed(622)

Load Data

df <- read_delim("bank-full.csv", delim = ";",
                   escape_double = FALSE, trim_ws = TRUE) |> 
  rename(target = y)

Decision Tree Experimentation

Prep data

# Coerce target data to factor
df$target <- factor(df$target) 



# Create a split that keeps both classes in train and test
make_split_with_both_classes <- function(dat, p = 0.8, strata = "target", max_tries = 100) {
  for (i in 1:max_tries) {
    idx <- createDataPartition(dat[[strata]], p = p, list = FALSE)
    train <- dat[idx, , drop = FALSE]
    test  <- dat[-idx, , drop = FALSE]
    if (nlevels(droplevels(train[[strata]])) == 2 &&
        nlevels(droplevels(test[[strata]]))  == 2) {
      return(list(train = train, test = test))
    }
  }
}

splits <- make_split_with_both_classes(df, p = 0.8, strata = "target")
train <- splits$train
test  <- splits$test

# Create function to organize result metrics
metrics <- function(y_true, y_pred_class, y_pred_prob = NULL) {
  y_true <- droplevels(y_true)
  lv <- levels(y_true)

  # Confusion-matrix-based metrics for two levels
  if (nlevels(y_true) == 2) {
    positive <- lv[2]
    tp <- sum(y_true == positive & y_pred_class == positive)
    fp <- sum(y_true != positive & y_pred_class == positive)
    fn <- sum(y_true == positive & y_pred_class != positive)
    tn <- sum(y_true != positive & y_pred_class != positive)

    accuracy  <- (tp + tn) / (tp + tn + fp + fn)
    precision <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
    recall    <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
    f1        <- ifelse(is.na(precision) | is.na(recall) | (precision + recall == 0),
                        NA, 2 * precision * recall / (precision + recall))

    # AUC only if we have a prob column for the positive class
    auc <- NA
    if (!is.null(y_pred_prob)) {
      auc <- as.numeric(roc(y_true, y_pred_prob, levels = lv, quiet = TRUE)$auc)
    }

    return(data.frame(accuracy, precision, recall, f1, auc, check.names = FALSE))
  } else {
    accuracy <- mean(y_pred_class == y_true)
    return(data.frame(accuracy = accuracy,
                      precision = NA, recall = NA, f1 = NA, auc = NA,
                      check.names = FALSE))
  }
}


fold_ids <- createFolds(train$target, k = 5, returnTrain = TRUE)
ctrl_shared <- trainControl(
  method = "cv",
  index = fold_ids,
  classProbs = TRUE,
  summaryFunction = twoClassSummary, # optimize ROC
  savePredictions = "final"
)


pos_level <- tail(levels(train$target), 1)

First Model

m_base <- train(
  target ~ ., data = train,
  method = "rpart",
  trControl = ctrl_shared,
  metric = "ROC",
  tuneLength = 20
)

Second Model

m_depth <- train(
  target ~ ., data = train,
  method = "rpart2",  # tunes maxdepth instead of cp
  trControl = ctrl_shared,
  metric = "ROC",
  tuneLength = 15     
)

## note: only 8 possible values of the max tree depth from the initial fit.
##  Truncating the grid to 8 .

Evaluation Metrics

# Evaluate both models on same test set
p_base_prob <- predict(m_base,  newdata = test, type = "prob")[, pos_level, drop = TRUE]
p_base_cls  <- predict(m_base,  newdata = test)
m_base_metrics <- metrics(test$target, p_base_cls, p_base_prob)
m_base_metrics$model <- "Tuned (cp)"

p_depth_prob <- predict(m_depth, newdata = test, type = "prob")[, pos_level, drop = TRUE]
p_depth_cls  <- predict(m_depth, newdata = test)
m_depth_metrics <- metrics(test$target, p_depth_cls, p_depth_prob)
m_depth_metrics$model <- "Tuned (maxdepth)"

results <- rbind(
  m_base_metrics[, c("model","accuracy","precision","recall","f1","auc")],
  m_depth_metrics[, c("model","accuracy","precision","recall","f1","auc")]
)
print(results)

##              model  accuracy precision    recall        f1       auc
## 1       Tuned (cp) 0.9074217 0.6540616 0.4418165 0.5273857 0.8639102
## 2 Tuned (maxdepth) 0.9013383 0.6565465 0.3273415 0.4368687 0.7490713

Visualize Trees

rpart.plot(m_base$finalModel,  main = sprintf("Base (cp): cp=%.4f", m_base$bestTune$cp))

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

rpart.plot(m_depth$finalModel, main = sprintf("Variant (maxdepth): depth=%d", m_depth$bestTune$maxdepth))

Random Forest Experimentation

Prep Data

# Positive class FIRST (for consistent AUC direction later)
pos <- tail(levels(df$target), 1)
neg <- setdiff(levels(df$target), pos)[1]
df$target <- factor(df$target, levels = c(pos, neg))  # positive first

# Light predictor cleanup
char_cols <- names(df)[sapply(df, is.character)]
for (cc in char_cols) df[[cc]] <- as.factor(df[[cc]])


# Train/Test split
spl <- initial_split(df, prop = 0.8, strata = target)
train <- training(spl)
test  <- testing(spl)

pos_level <- levels(train$target)[1]  # positive is first


safe_auc <- function(y_true, y_prob_pos, pos_label = pos_level) {
  y_true <- droplevels(y_true)
  if (nlevels(y_true) < 2) return(NA_real_)
  # pROC expects levels=c(neg,pos); we already have pos first, so reverse:
  as.numeric(roc(response = y_true, predictor = y_prob_pos,
                 levels = rev(levels(y_true)), quiet = TRUE)$auc)
}

safe_acc <- function(y_true, y_pred) mean(y_true == y_pred)

Prep Models

# one-fit-one-predict for ranger (classification)
fit_rf <- function(form, data, num.trees = 200, mtry = NULL,
                   min.node.size = 10, max.depth = 20, sample.fraction = 0.7) {
  ranger(
    formula = form,
    data = data,
    num.trees = num.trees,
    mtry = mtry,
    min.node.size = min.node.size,
    max.depth = max.depth,
    sample.fraction = sample.fraction,
    probability = TRUE,   # we need class probs for ROC
    classification = TRUE,
    respect.unordered.factors = "order",  # fast, consistent
    num.threads = max(1, parallel::detectCores() - 1),
    seed = 42,
    verbose = FALSE
  )
}

# Two different RF models
p <- ncol(train) - 1
mtry_A <- max(1, round(sqrt(p)))
mtry_B <- max(1, round(0.25 * p))

cfg_A <- list(num.trees = 200, mtry = mtry_A, min.node.size = 10, max.depth = 20, sample.fraction = 0.7)
cfg_B <- list(num.trees = 300, mtry = mtry_B, min.node.size = 15, max.depth = 20, sample.fraction = 0.7)

form <- as.formula("target ~ .")

folds <- vfold_cv(train, v = 3, strata = target)

cv_eval <- function(cfg) {
  aucs <- c(); accs <- c()
  for (i in seq_len(nrow(folds))) {
    tr_ix <- folds$splits[[i]]$in_id
    tr <- train[tr_ix, , drop = FALSE]
    va <- assessment(folds$splits[[i]])

    # Fit on fold-train
    fit <- fit_rf(form, tr, num.trees = cfg$num.trees, mtry = cfg$mtry,
                  min.node.size = cfg$min.node.size, max.depth = cfg$max.depth,
                  sample.fraction = cfg$sample.fraction)

    # Predict on fold-assessment
    prob <- predict(fit, data = va, type = "response")$predictions[, pos_level]
    cls  <- ifelse(prob >= 0.5, pos_level, levels(train$target)[2])
    cls  <- factor(cls, levels = levels(train$target))

    # Metrics (skip AUC if single-class assessment)
    aucs[i] <- safe_auc(va$target, prob, pos_label = pos_level)
    accs[i] <- safe_acc(va$target, cls)
  }
  c(
    auc_mean = mean(aucs, na.rm = TRUE),
    auc_sd   = sd(aucs,  na.rm = TRUE),
    acc_mean = mean(accs),
    acc_sd   = sd(accs)
  )
}

cv_A <- cv_eval(cfg_A)
cv_B <- cv_eval(cfg_B)

cat("\n=== CV Results ===\n")

## 
## === CV Results ===

print(rbind(
  `RF-A (mtry ~ sqrt(p))` = cv_A,
  `RF-B (mtry ~ 0.25*p, larger leaves)` = cv_B
))

##                                      auc_mean      auc_sd  acc_mean
## RF-A (mtry ~ sqrt(p))               0.9299809 0.001734804 0.9074596
## RF-B (mtry ~ 0.25*p, larger leaves) 0.9302337 0.002245681 0.9067131
##                                           acc_sd
## RF-A (mtry ~ sqrt(p))               0.0008046334
## RF-B (mtry ~ 0.25*p, larger leaves) 0.0005903118

Fit Models

fit_A <- fit_rf(form, train, num.trees = cfg_A$num.trees, mtry = cfg_A$mtry,
                min.node.size = cfg_A$min.node.size, max.depth = cfg_A$max.depth,
                sample.fraction = cfg_A$sample.fraction)

fit_B <- fit_rf(form, train, num.trees = cfg_B$num.trees, mtry = cfg_B$mtry,
                min.node.size = cfg_B$min.node.size, max.depth = cfg_B$max.depth,
                sample.fraction = cfg_B$sample.fraction)

prob_A <- predict(fit_A, data = test, type = "response")$predictions[, pos_level]
cls_A  <- factor(ifelse(prob_A >= 0.5, pos_level, levels(train$target)[2]),
                 levels = levels(train$target))

prob_B <- predict(fit_B, data = test, type = "response")$predictions[, pos_level]
cls_B  <- factor(ifelse(prob_B >= 0.5, pos_level, levels(train$target)[2]),
                 levels = levels(train$target))

Evaluation Metrics

test_metrics <- function(lbl) {
  function(prob, cls) {
    data.frame(
      model    = lbl,
      accuracy = safe_acc(test$target, cls),
      auc      = safe_auc(test$target, prob, pos_label = pos_level),
      precision = {
        positive <- pos_level
        tp <- sum(test$target == positive & cls == positive)
        fp <- sum(test$target != positive & cls == positive)
        if (tp + fp == 0) NA_real_ else tp/(tp+fp)
      },
      recall = {
        positive <- pos_level
        tp <- sum(test$target == positive & cls == positive)
        fn <- sum(test$target == positive & cls != positive)
        if (tp + fn == 0) NA_real_ else tp/(tp+fn)
      }
    )
  }
}

met_A <- test_metrics("RF-A")(prob_A, cls_A)
met_B <- test_metrics("RF-B")(prob_B, cls_B)

cat("\n=== Test Results ===\n")

## 
## === Test Results ===

print(rbind(met_A, met_B))

##   model  accuracy       auc precision    recall
## 1  RF-A 0.9077740 0.9331561 0.6637427 0.4291115
## 2  RF-B 0.9075528 0.9330660 0.6632353 0.4262760

Variable Importance

#Refit both models with variable importance enabled
fit_A_imp <- ranger(
  formula = form, data = train,
  num.trees = cfg_A$num.trees,
  mtry = cfg_A$mtry,
  min.node.size = cfg_A$min.node.size,
  max.depth = cfg_A$max.depth,
  sample.fraction = cfg_A$sample.fraction,
  probability = TRUE, classification = TRUE,
  respect.unordered.factors = "order",
  importance = "impurity",        # <--- change to "permutation" for permutation importance
  num.threads = max(1, parallel::detectCores() - 1),
  seed = 42
)

fit_B_imp <- ranger(
  formula = form, data = train,
  num.trees = cfg_B$num.trees,
  mtry = cfg_B$mtry,
  min.node.size = cfg_B$min.node.size,
  max.depth = cfg_B$max.depth,
  sample.fraction = cfg_B$sample.fraction,
  probability = TRUE, classification = TRUE,
  respect.unordered.factors = "order",
  importance = "impurity",        # <--- or "permutation"
  num.threads = max(1, parallel::detectCores() - 1),
  seed = 42
)

imp_A <- sort(fit_A_imp$variable.importance, decreasing = TRUE)
imp_B <- sort(fit_B_imp$variable.importance, decreasing = TRUE)

imp_A_df <- data.frame(feature = names(imp_A), importance = as.numeric(imp_A), row.names = NULL)
imp_B_df <- data.frame(feature = names(imp_B), importance = as.numeric(imp_B), row.names = NULL)

cat("\nTop 15 features — Model A\n")

## 
## Top 15 features — Model A

print(head(imp_A_df, 15))

##      feature importance
## 1   duration 1271.95532
## 2      month  411.53807
## 3   poutcome  349.76453
## 4    balance  334.53758
## 5        age  328.63157
## 6        day  297.23299
## 7      pdays  197.86732
## 8        job  152.76310
## 9   campaign  112.35607
## 10   housing   91.67150
## 11  previous   86.62537
## 12   contact   83.78486
## 13 education   73.48927
## 14   marital   58.11050
## 15      loan   30.23858

cat("\nTop 15 features — Model B\n")

## 
## Top 15 features — Model B

print(head(imp_B_df, 15))

##      feature importance
## 1   duration 1217.47812
## 2      month  390.20289
## 3   poutcome  350.77981
## 4        age  286.92198
## 5    balance  282.76165
## 6        day  258.84177
## 7      pdays  178.81014
## 8        job  128.56799
## 9   campaign   94.22775
## 10   housing   84.86288
## 11   contact   79.95396
## 12  previous   78.10499
## 13 education   60.27271
## 14   marital   47.58503
## 15      loan   25.39396

Adaboost Experimentation

Prep data

#Train/Test split
idx <- caret::createDataPartition(df$target, p = 0.8, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]
pos_level <- levels(train$target)[1]  # positive is FIRST

# CV folds
make_strict_folds <- function(y, k = 3, max_tries = 200) {
  for (i in 1:max_tries) {
    f <- caret::createFolds(y, k = k, returnTrain = TRUE)
    ok <- all(vapply(f, function(tr) length(unique(y[-tr])) == 2, logical(1)))
    if (ok) return(f)
  }
}
fold_ids <- make_strict_folds(train$target, k = 3)


safe_auc <- function(y_true, y_prob_pos) {
  y_true <- droplevels(y_true)
  if (nlevels(y_true) < 2) return(NA_real_)
  as.numeric(roc(y_true, y_prob_pos, levels = rev(levels(y_true)), quiet = TRUE)$auc)
}
safe_acc <- function(y_true, y_pred) mean(y_true == y_pred)

metrics_full <- function(y_true, y_pred, y_prob_pos) {
  y_true <- droplevels(y_true)
  positive <- levels(y_true)[1]
  tp <- sum(y_true == positive & y_pred == positive)
  fp <- sum(y_true != positive & y_pred == positive)
  fn <- sum(y_true == positive & y_pred != positive)
  tn <- sum(y_true != positive & y_pred != positive)
  acc <- (tp + tn) / (tp + tn + fp + fn)
  pre <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
  rec <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
  f1  <- ifelse(is.na(pre) | is.na(rec) | (pre + rec == 0), NA, 2 * pre * rec / (pre + rec))
  auc <- safe_auc(y_true, y_prob_pos)
  data.frame(accuracy = acc, precision = pre, recall = rec, f1 = f1, auc = auc, check.names = FALSE)
}

Model Experimentation

# Model configurations
form <- target ~ .
cfg_A <- list(iter = 150, depth = 2)   # shallower, fewer rounds
cfg_B <- list(iter = 250, depth = 3)   # slightly deeper, more rounds

fit_ada <- function(cfg, data) {
  ada(
    formula = form,
    data = data,
    iter = cfg$iter,
    nu = 1,                                  # learning rate
    control = rpart.control(maxdepth = cfg$depth, cp = 0)
  )
}

# Modify predictor for ada
predict_ada <- function(model, newdata, pos_level) {
  # Try to get both classes + probabilities
  pr_both <- tryCatch(predict(model, newdata, type = "both"), error = function(e) NULL)

  if (!is.null(pr_both)) {
    prob <- pr_both$prob
    cls  <- factor(pr_both$class, levels = levels(newdata$target))
  } else {
    prob <- tryCatch(predict(model, newdata, type = "prob"),   error = function(e) NULL)
    cls  <- factor(tryCatch(predict(model, newdata, type = "vector"), error = function(e) NA),
                   levels = levels(newdata$target))
  }

  # Coerce to matrix and ensure column names
  if (is.null(prob)) {
    prob_pos <- rep(NA_real_, nrow(newdata))
  } else {
    prob_mat <- as.matrix(prob)

    if (is.null(colnames(prob_mat)) && ncol(prob_mat) == length(levels(newdata$target))) {
      colnames(prob_mat) <- levels(newdata$target)
    }

    if (!is.null(colnames(prob_mat)) && pos_level %in% colnames(prob_mat)) {
      prob_pos <- prob_mat[, pos_level]
    } else if (ncol(prob_mat) == 2) {
      # Fall back: positive is FIRST factor level per our setup
      prob_pos <- prob_mat[, 1]
    } else if (is.vector(prob) && length(prob) == nrow(newdata)) {
      prob_pos <- as.numeric(prob)
    } else {
      prob_pos <- rep(NA_real_, nrow(newdata))
    }
  }

  list(prob_pos = prob_pos, cls = cls)
}

# 3-fold CV
cv_eval <- function(cfg) {
  aucs <- c(); accs <- c()
  for (i in seq_along(fold_ids)) {
    tr_ix <- fold_ids[[i]]
    tr <- train[tr_ix, ]
    va <- train[-tr_ix, ]

    fit  <- fit_ada(cfg, tr)
    pred <- predict_ada(fit, va, pos_level)

    aucs[i] <- safe_auc(va$target, pred$prob_pos)
    accs[i] <- safe_acc(va$target, pred$cls)
  }
  c(AUC_mean = mean(aucs, na.rm=TRUE), AUC_sd = sd(aucs, na.rm=TRUE),
    ACC_mean = mean(accs), ACC_sd = sd(accs))
}

Evaluation

# CV results
cv_A <- cv_eval(cfg_A)
cv_B <- cv_eval(cfg_B)

cat("\n=== CV Results (AdaBoost via `ada`) ===\n")

## 
## === CV Results (AdaBoost via `ada`) ===

print(rbind(`ADA-A` = cv_A, `ADA-B` = cv_B))

##        AUC_mean      AUC_sd  ACC_mean       ACC_sd
## ADA-A 0.9230700 0.006310872 0.9033176 0.0028190190
## ADA-B 0.9246751 0.002834275 0.9037047 0.0002239786

# Final Metrics
fit_A <- fit_ada(cfg_A, train)
fit_B <- fit_ada(cfg_B, train)

pred_A <- predict_ada(fit_A, test, pos_level)
pred_B <- predict_ada(fit_B, test, pos_level)

met_A <- metrics_full(test$target, pred_A$cls, pred_A$prob_pos); met_A$model <- "ADA-A"
met_B <- metrics_full(test$target, pred_B$cls, pred_B$prob_pos); met_B$model <- "ADA-B"

cat("\n=== Test Results ===\n")

## 
## === Test Results ===

print(rbind(
  met_A[, c("model","accuracy","precision","recall","f1","auc")],
  met_B[, c("model","accuracy","precision","recall","f1","auc")]
))

##   model  accuracy precision    recall        f1       auc
## 1 ADA-A 0.9014490 0.6155989 0.4181646 0.4980282 0.9185975
## 2 ADA-B 0.9034399 0.6167513 0.4597919 0.5268293 0.9235401

Assignment 02 - Experimentation & Training with Bank Data

Alex Ptacek

Load Packages

Load Data

Decision Tree Experimentation

Prep data

First Model

Second Model

Evaluation Metrics

Visualize Trees

Random Forest Experimentation

Prep Data

Prep Models

Fit Models

Evaluation Metrics

Variable Importance

Adaboost Experimentation

Prep data

Model Experimentation

Evaluation