Decision Tree Experimentation
Prep data
# Coerce target data to factor
df$target <- factor(df$target)
# Create a split that keeps both classes in train and test
make_split_with_both_classes <- function(dat, p = 0.8, strata = "target", max_tries = 100) {
for (i in 1:max_tries) {
idx <- createDataPartition(dat[[strata]], p = p, list = FALSE)
train <- dat[idx, , drop = FALSE]
test <- dat[-idx, , drop = FALSE]
if (nlevels(droplevels(train[[strata]])) == 2 &&
nlevels(droplevels(test[[strata]])) == 2) {
return(list(train = train, test = test))
}
}
}
splits <- make_split_with_both_classes(df, p = 0.8, strata = "target")
train <- splits$train
test <- splits$test
# Create function to organize result metrics
metrics <- function(y_true, y_pred_class, y_pred_prob = NULL) {
y_true <- droplevels(y_true)
lv <- levels(y_true)
# Confusion-matrix-based metrics for two levels
if (nlevels(y_true) == 2) {
positive <- lv[2]
tp <- sum(y_true == positive & y_pred_class == positive)
fp <- sum(y_true != positive & y_pred_class == positive)
fn <- sum(y_true == positive & y_pred_class != positive)
tn <- sum(y_true != positive & y_pred_class != positive)
accuracy <- (tp + tn) / (tp + tn + fp + fn)
precision <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
recall <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
f1 <- ifelse(is.na(precision) | is.na(recall) | (precision + recall == 0),
NA, 2 * precision * recall / (precision + recall))
# AUC only if we have a prob column for the positive class
auc <- NA
if (!is.null(y_pred_prob)) {
auc <- as.numeric(roc(y_true, y_pred_prob, levels = lv, quiet = TRUE)$auc)
}
return(data.frame(accuracy, precision, recall, f1, auc, check.names = FALSE))
} else {
accuracy <- mean(y_pred_class == y_true)
return(data.frame(accuracy = accuracy,
precision = NA, recall = NA, f1 = NA, auc = NA,
check.names = FALSE))
}
}
fold_ids <- createFolds(train$target, k = 5, returnTrain = TRUE)
ctrl_shared <- trainControl(
method = "cv",
index = fold_ids,
classProbs = TRUE,
summaryFunction = twoClassSummary, # optimize ROC
savePredictions = "final"
)
pos_level <- tail(levels(train$target), 1)
First Model
m_base <- train(
target ~ ., data = train,
method = "rpart",
trControl = ctrl_shared,
metric = "ROC",
tuneLength = 20
)
Second Model
m_depth <- train(
target ~ ., data = train,
method = "rpart2", # tunes maxdepth instead of cp
trControl = ctrl_shared,
metric = "ROC",
tuneLength = 15
)
## note: only 8 possible values of the max tree depth from the initial fit.
## Truncating the grid to 8 .
Evaluation Metrics
# Evaluate both models on same test set
p_base_prob <- predict(m_base, newdata = test, type = "prob")[, pos_level, drop = TRUE]
p_base_cls <- predict(m_base, newdata = test)
m_base_metrics <- metrics(test$target, p_base_cls, p_base_prob)
m_base_metrics$model <- "Tuned (cp)"
p_depth_prob <- predict(m_depth, newdata = test, type = "prob")[, pos_level, drop = TRUE]
p_depth_cls <- predict(m_depth, newdata = test)
m_depth_metrics <- metrics(test$target, p_depth_cls, p_depth_prob)
m_depth_metrics$model <- "Tuned (maxdepth)"
results <- rbind(
m_base_metrics[, c("model","accuracy","precision","recall","f1","auc")],
m_depth_metrics[, c("model","accuracy","precision","recall","f1","auc")]
)
print(results)
## model accuracy precision recall f1 auc
## 1 Tuned (cp) 0.9074217 0.6540616 0.4418165 0.5273857 0.8639102
## 2 Tuned (maxdepth) 0.9013383 0.6565465 0.3273415 0.4368687 0.7490713
Visualize Trees
rpart.plot(m_base$finalModel, main = sprintf("Base (cp): cp=%.4f", m_base$bestTune$cp))
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

rpart.plot(m_depth$finalModel, main = sprintf("Variant (maxdepth): depth=%d", m_depth$bestTune$maxdepth))

Random Forest Experimentation
Prep Data
# Positive class FIRST (for consistent AUC direction later)
pos <- tail(levels(df$target), 1)
neg <- setdiff(levels(df$target), pos)[1]
df$target <- factor(df$target, levels = c(pos, neg)) # positive first
# Light predictor cleanup
char_cols <- names(df)[sapply(df, is.character)]
for (cc in char_cols) df[[cc]] <- as.factor(df[[cc]])
# Train/Test split
spl <- initial_split(df, prop = 0.8, strata = target)
train <- training(spl)
test <- testing(spl)
pos_level <- levels(train$target)[1] # positive is first
safe_auc <- function(y_true, y_prob_pos, pos_label = pos_level) {
y_true <- droplevels(y_true)
if (nlevels(y_true) < 2) return(NA_real_)
# pROC expects levels=c(neg,pos); we already have pos first, so reverse:
as.numeric(roc(response = y_true, predictor = y_prob_pos,
levels = rev(levels(y_true)), quiet = TRUE)$auc)
}
safe_acc <- function(y_true, y_pred) mean(y_true == y_pred)
Prep Models
# one-fit-one-predict for ranger (classification)
fit_rf <- function(form, data, num.trees = 200, mtry = NULL,
min.node.size = 10, max.depth = 20, sample.fraction = 0.7) {
ranger(
formula = form,
data = data,
num.trees = num.trees,
mtry = mtry,
min.node.size = min.node.size,
max.depth = max.depth,
sample.fraction = sample.fraction,
probability = TRUE, # we need class probs for ROC
classification = TRUE,
respect.unordered.factors = "order", # fast, consistent
num.threads = max(1, parallel::detectCores() - 1),
seed = 42,
verbose = FALSE
)
}
# Two different RF models
p <- ncol(train) - 1
mtry_A <- max(1, round(sqrt(p)))
mtry_B <- max(1, round(0.25 * p))
cfg_A <- list(num.trees = 200, mtry = mtry_A, min.node.size = 10, max.depth = 20, sample.fraction = 0.7)
cfg_B <- list(num.trees = 300, mtry = mtry_B, min.node.size = 15, max.depth = 20, sample.fraction = 0.7)
form <- as.formula("target ~ .")
folds <- vfold_cv(train, v = 3, strata = target)
cv_eval <- function(cfg) {
aucs <- c(); accs <- c()
for (i in seq_len(nrow(folds))) {
tr_ix <- folds$splits[[i]]$in_id
tr <- train[tr_ix, , drop = FALSE]
va <- assessment(folds$splits[[i]])
# Fit on fold-train
fit <- fit_rf(form, tr, num.trees = cfg$num.trees, mtry = cfg$mtry,
min.node.size = cfg$min.node.size, max.depth = cfg$max.depth,
sample.fraction = cfg$sample.fraction)
# Predict on fold-assessment
prob <- predict(fit, data = va, type = "response")$predictions[, pos_level]
cls <- ifelse(prob >= 0.5, pos_level, levels(train$target)[2])
cls <- factor(cls, levels = levels(train$target))
# Metrics (skip AUC if single-class assessment)
aucs[i] <- safe_auc(va$target, prob, pos_label = pos_level)
accs[i] <- safe_acc(va$target, cls)
}
c(
auc_mean = mean(aucs, na.rm = TRUE),
auc_sd = sd(aucs, na.rm = TRUE),
acc_mean = mean(accs),
acc_sd = sd(accs)
)
}
cv_A <- cv_eval(cfg_A)
cv_B <- cv_eval(cfg_B)
cat("\n=== CV Results ===\n")
##
## === CV Results ===
print(rbind(
`RF-A (mtry ~ sqrt(p))` = cv_A,
`RF-B (mtry ~ 0.25*p, larger leaves)` = cv_B
))
## auc_mean auc_sd acc_mean
## RF-A (mtry ~ sqrt(p)) 0.9299809 0.001734804 0.9074596
## RF-B (mtry ~ 0.25*p, larger leaves) 0.9302337 0.002245681 0.9067131
## acc_sd
## RF-A (mtry ~ sqrt(p)) 0.0008046334
## RF-B (mtry ~ 0.25*p, larger leaves) 0.0005903118
Fit Models
fit_A <- fit_rf(form, train, num.trees = cfg_A$num.trees, mtry = cfg_A$mtry,
min.node.size = cfg_A$min.node.size, max.depth = cfg_A$max.depth,
sample.fraction = cfg_A$sample.fraction)
fit_B <- fit_rf(form, train, num.trees = cfg_B$num.trees, mtry = cfg_B$mtry,
min.node.size = cfg_B$min.node.size, max.depth = cfg_B$max.depth,
sample.fraction = cfg_B$sample.fraction)
prob_A <- predict(fit_A, data = test, type = "response")$predictions[, pos_level]
cls_A <- factor(ifelse(prob_A >= 0.5, pos_level, levels(train$target)[2]),
levels = levels(train$target))
prob_B <- predict(fit_B, data = test, type = "response")$predictions[, pos_level]
cls_B <- factor(ifelse(prob_B >= 0.5, pos_level, levels(train$target)[2]),
levels = levels(train$target))
Evaluation Metrics
test_metrics <- function(lbl) {
function(prob, cls) {
data.frame(
model = lbl,
accuracy = safe_acc(test$target, cls),
auc = safe_auc(test$target, prob, pos_label = pos_level),
precision = {
positive <- pos_level
tp <- sum(test$target == positive & cls == positive)
fp <- sum(test$target != positive & cls == positive)
if (tp + fp == 0) NA_real_ else tp/(tp+fp)
},
recall = {
positive <- pos_level
tp <- sum(test$target == positive & cls == positive)
fn <- sum(test$target == positive & cls != positive)
if (tp + fn == 0) NA_real_ else tp/(tp+fn)
}
)
}
}
met_A <- test_metrics("RF-A")(prob_A, cls_A)
met_B <- test_metrics("RF-B")(prob_B, cls_B)
cat("\n=== Test Results ===\n")
##
## === Test Results ===
print(rbind(met_A, met_B))
## model accuracy auc precision recall
## 1 RF-A 0.9077740 0.9331561 0.6637427 0.4291115
## 2 RF-B 0.9075528 0.9330660 0.6632353 0.4262760
Variable Importance
#Refit both models with variable importance enabled
fit_A_imp <- ranger(
formula = form, data = train,
num.trees = cfg_A$num.trees,
mtry = cfg_A$mtry,
min.node.size = cfg_A$min.node.size,
max.depth = cfg_A$max.depth,
sample.fraction = cfg_A$sample.fraction,
probability = TRUE, classification = TRUE,
respect.unordered.factors = "order",
importance = "impurity", # <--- change to "permutation" for permutation importance
num.threads = max(1, parallel::detectCores() - 1),
seed = 42
)
fit_B_imp <- ranger(
formula = form, data = train,
num.trees = cfg_B$num.trees,
mtry = cfg_B$mtry,
min.node.size = cfg_B$min.node.size,
max.depth = cfg_B$max.depth,
sample.fraction = cfg_B$sample.fraction,
probability = TRUE, classification = TRUE,
respect.unordered.factors = "order",
importance = "impurity", # <--- or "permutation"
num.threads = max(1, parallel::detectCores() - 1),
seed = 42
)
imp_A <- sort(fit_A_imp$variable.importance, decreasing = TRUE)
imp_B <- sort(fit_B_imp$variable.importance, decreasing = TRUE)
imp_A_df <- data.frame(feature = names(imp_A), importance = as.numeric(imp_A), row.names = NULL)
imp_B_df <- data.frame(feature = names(imp_B), importance = as.numeric(imp_B), row.names = NULL)
cat("\nTop 15 features — Model A\n")
##
## Top 15 features — Model A
print(head(imp_A_df, 15))
## feature importance
## 1 duration 1271.95532
## 2 month 411.53807
## 3 poutcome 349.76453
## 4 balance 334.53758
## 5 age 328.63157
## 6 day 297.23299
## 7 pdays 197.86732
## 8 job 152.76310
## 9 campaign 112.35607
## 10 housing 91.67150
## 11 previous 86.62537
## 12 contact 83.78486
## 13 education 73.48927
## 14 marital 58.11050
## 15 loan 30.23858
cat("\nTop 15 features — Model B\n")
##
## Top 15 features — Model B
print(head(imp_B_df, 15))
## feature importance
## 1 duration 1217.47812
## 2 month 390.20289
## 3 poutcome 350.77981
## 4 age 286.92198
## 5 balance 282.76165
## 6 day 258.84177
## 7 pdays 178.81014
## 8 job 128.56799
## 9 campaign 94.22775
## 10 housing 84.86288
## 11 contact 79.95396
## 12 previous 78.10499
## 13 education 60.27271
## 14 marital 47.58503
## 15 loan 25.39396
Adaboost Experimentation
Prep data
#Train/Test split
idx <- caret::createDataPartition(df$target, p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]
pos_level <- levels(train$target)[1] # positive is FIRST
# CV folds
make_strict_folds <- function(y, k = 3, max_tries = 200) {
for (i in 1:max_tries) {
f <- caret::createFolds(y, k = k, returnTrain = TRUE)
ok <- all(vapply(f, function(tr) length(unique(y[-tr])) == 2, logical(1)))
if (ok) return(f)
}
}
fold_ids <- make_strict_folds(train$target, k = 3)
safe_auc <- function(y_true, y_prob_pos) {
y_true <- droplevels(y_true)
if (nlevels(y_true) < 2) return(NA_real_)
as.numeric(roc(y_true, y_prob_pos, levels = rev(levels(y_true)), quiet = TRUE)$auc)
}
safe_acc <- function(y_true, y_pred) mean(y_true == y_pred)
metrics_full <- function(y_true, y_pred, y_prob_pos) {
y_true <- droplevels(y_true)
positive <- levels(y_true)[1]
tp <- sum(y_true == positive & y_pred == positive)
fp <- sum(y_true != positive & y_pred == positive)
fn <- sum(y_true == positive & y_pred != positive)
tn <- sum(y_true != positive & y_pred != positive)
acc <- (tp + tn) / (tp + tn + fp + fn)
pre <- ifelse(tp + fp == 0, NA, tp / (tp + fp))
rec <- ifelse(tp + fn == 0, NA, tp / (tp + fn))
f1 <- ifelse(is.na(pre) | is.na(rec) | (pre + rec == 0), NA, 2 * pre * rec / (pre + rec))
auc <- safe_auc(y_true, y_prob_pos)
data.frame(accuracy = acc, precision = pre, recall = rec, f1 = f1, auc = auc, check.names = FALSE)
}
Model Experimentation
# Model configurations
form <- target ~ .
cfg_A <- list(iter = 150, depth = 2) # shallower, fewer rounds
cfg_B <- list(iter = 250, depth = 3) # slightly deeper, more rounds
fit_ada <- function(cfg, data) {
ada(
formula = form,
data = data,
iter = cfg$iter,
nu = 1, # learning rate
control = rpart.control(maxdepth = cfg$depth, cp = 0)
)
}
# Modify predictor for ada
predict_ada <- function(model, newdata, pos_level) {
# Try to get both classes + probabilities
pr_both <- tryCatch(predict(model, newdata, type = "both"), error = function(e) NULL)
if (!is.null(pr_both)) {
prob <- pr_both$prob
cls <- factor(pr_both$class, levels = levels(newdata$target))
} else {
prob <- tryCatch(predict(model, newdata, type = "prob"), error = function(e) NULL)
cls <- factor(tryCatch(predict(model, newdata, type = "vector"), error = function(e) NA),
levels = levels(newdata$target))
}
# Coerce to matrix and ensure column names
if (is.null(prob)) {
prob_pos <- rep(NA_real_, nrow(newdata))
} else {
prob_mat <- as.matrix(prob)
if (is.null(colnames(prob_mat)) && ncol(prob_mat) == length(levels(newdata$target))) {
colnames(prob_mat) <- levels(newdata$target)
}
if (!is.null(colnames(prob_mat)) && pos_level %in% colnames(prob_mat)) {
prob_pos <- prob_mat[, pos_level]
} else if (ncol(prob_mat) == 2) {
# Fall back: positive is FIRST factor level per our setup
prob_pos <- prob_mat[, 1]
} else if (is.vector(prob) && length(prob) == nrow(newdata)) {
prob_pos <- as.numeric(prob)
} else {
prob_pos <- rep(NA_real_, nrow(newdata))
}
}
list(prob_pos = prob_pos, cls = cls)
}
# 3-fold CV
cv_eval <- function(cfg) {
aucs <- c(); accs <- c()
for (i in seq_along(fold_ids)) {
tr_ix <- fold_ids[[i]]
tr <- train[tr_ix, ]
va <- train[-tr_ix, ]
fit <- fit_ada(cfg, tr)
pred <- predict_ada(fit, va, pos_level)
aucs[i] <- safe_auc(va$target, pred$prob_pos)
accs[i] <- safe_acc(va$target, pred$cls)
}
c(AUC_mean = mean(aucs, na.rm=TRUE), AUC_sd = sd(aucs, na.rm=TRUE),
ACC_mean = mean(accs), ACC_sd = sd(accs))
}
Evaluation
# CV results
cv_A <- cv_eval(cfg_A)
cv_B <- cv_eval(cfg_B)
cat("\n=== CV Results (AdaBoost via `ada`) ===\n")
##
## === CV Results (AdaBoost via `ada`) ===
print(rbind(`ADA-A` = cv_A, `ADA-B` = cv_B))
## AUC_mean AUC_sd ACC_mean ACC_sd
## ADA-A 0.9230700 0.006310872 0.9033176 0.0028190190
## ADA-B 0.9246751 0.002834275 0.9037047 0.0002239786
# Final Metrics
fit_A <- fit_ada(cfg_A, train)
fit_B <- fit_ada(cfg_B, train)
pred_A <- predict_ada(fit_A, test, pos_level)
pred_B <- predict_ada(fit_B, test, pos_level)
met_A <- metrics_full(test$target, pred_A$cls, pred_A$prob_pos); met_A$model <- "ADA-A"
met_B <- metrics_full(test$target, pred_B$cls, pred_B$prob_pos); met_B$model <- "ADA-B"
cat("\n=== Test Results ===\n")
##
## === Test Results ===
print(rbind(
met_A[, c("model","accuracy","precision","recall","f1","auc")],
met_B[, c("model","accuracy","precision","recall","f1","auc")]
))
## model accuracy precision recall f1 auc
## 1 ADA-A 0.9014490 0.6155989 0.4181646 0.4980282 0.9185975
## 2 ADA-B 0.9034399 0.6167513 0.4597919 0.5268293 0.9235401