Introduction

This analysis improves on prior studies (Decision Tree, Random Forest, and AdaBoost) by extending the model training phase with Support Vector Machines (SVMs). The goal is to compare linear and nonlinear kernels (RBF) to identify which has greater predictive potential for customer term deposit subscriptions.

Precision, recall, and F1-score translate directly into marketing efficiency KPIs, ensuring that outreach activities target potential subscribers while minimizing communication resource waste.

# Load Library & Prepare Data
library(tidyverse)

## Warning: package 'readr' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)
library(e1071)
library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

bank_data <- read_csv("clean_bank_data.csv")

## Rows: 45211 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): job, marital, education, default, housing, loan, contact, month, p...
## dbl  (7): age, balance, day, duration, campaign, pdays, previous
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Convert target variable
bank_data$y <- factor(bank_data$y, levels = c("no", "yes"))

# Scale numeric columns
num_cols <- sapply(bank_data, is.numeric)
bank_data[num_cols] <- scale(bank_data[num_cols])

# Sample smaller subset to reduce lag
set.seed(123)
bank_data <- bank_data %>% sample_n(5000)

# Split Train/Test (80/20)
set.seed(123)
trainIndex <- createDataPartition(bank_data$y, p = 0.8, list = FALSE)
train_bank_data <- bank_data[trainIndex, ]
test_bank_data  <- bank_data[-trainIndex, ]

# Evaluation Function
eval_model <- function(truth, pred_class, pred_prob, positive = "yes") {
  cm <- caret::confusionMatrix(pred_class, truth, positive = positive)
  auc_val <- NA
  if (!is.null(pred_prob)) {
    roc_obj <- try(pROC::roc(response = truth, predictor = pred_prob, 
                             levels = rev(levels(truth))), silent = TRUE)
    if (!inherits(roc_obj, "try-error")) auc_val <- as.numeric(pROC::auc(roc_obj))
  }
  tibble(
    Accuracy = cm$overall["Accuracy"],
    Precision = cm$byClass["Pos Pred Value"],
    Recall = cm$byClass["Sensitivity"],
    F1 = cm$byClass["F1"],
    AUC = auc_val
  )
}

#  Initialize Results
results <- tibble(
  Model = character(),
  Hyperparameters = character(),
  Accuracy = numeric(),
  Precision = numeric(),
  Recall = numeric(),
  F1 = numeric(),
  AUC = numeric()
)

Model Training and Evaluation

# Decision Tree

set.seed(100)
dt_model <- rpart(y ~ ., data = train_bank_data, method = "class",
                  control = rpart.control(maxdepth = 5, minsplit = 20))

dt_pred_class <- predict(dt_model, test_bank_data, type = "class")
dt_pred_prob  <- predict(dt_model, test_bank_data, type = "prob")[, "yes"]
dt_metrics <- eval_model(test_bank_data$y, dt_pred_class, dt_pred_prob)

## Setting direction: controls > cases

results <- results %>%
  add_row(
    Model = "Decision Tree",
    Hyperparameters = "maxdepth=5, minsplit=20",
    Accuracy = dt_metrics$Accuracy,
    Precision = dt_metrics$Precision,
    Recall = dt_metrics$Recall,
    F1 = dt_metrics$F1,
    AUC = dt_metrics$AUC
  )

# Linear SVM

set.seed(111)
C_grid_linear <- c(0.1, 1, 10)
best_lin <- NULL
best_lin_auc <- -Inf

for (C_val in C_grid_linear) {
  cat("Training Linear SVM with C =", C_val, "...\n")
  model_try <- try(
    svm(y ~ ., data = train_bank_data, kernel = "linear", cost = C_val, probability = TRUE),
    silent = TRUE
  )
  if (inherits(model_try, "try-error")) {
    cat("  Failed at C =", C_val, "\n")
    next
  }
  svm_lin <- model_try
  pred_obj <- predict(svm_lin, newdata = test_bank_data, probability = TRUE)
  probs <- attr(pred_obj, "probabilities")[, "yes"]
  metrics <- eval_model(test_bank_data$y, factor(pred_obj, levels = c("no", "yes")), probs)
  
  if (!is.na(metrics$AUC) && metrics$AUC > best_lin_auc) {
    best_lin_auc <- metrics$AUC
    best_lin <- list(C = C_val, metrics = metrics)
  }
}

## Training Linear SVM with C = 0.1 ...

## Setting direction: controls > cases

## Training Linear SVM with C = 1 ...

## Setting direction: controls > cases

## Training Linear SVM with C = 10 ...

## Setting direction: controls > cases

results <- results %>%
  add_row(
    Model = "SVM (Linear)",
    Hyperparameters = paste0("C=", best_lin$C),
    Accuracy = best_lin$metrics$Accuracy,
    Precision = best_lin$metrics$Precision,
    Recall = best_lin$metrics$Recall,
    F1 = best_lin$metrics$F1,
    AUC = best_lin$metrics$AUC
  )

# RBF SVM

set.seed(111)
C_grid_rbf <- c(0.5, 1)
gamma_grid_rbf <- c(0.01, 0.05)
best_rbf <- NULL
best_auc <- -Inf

for (C_val in C_grid_rbf) {
  for (g_val in gamma_grid_rbf) {
    cat("Training RBF SVM with C =", C_val, "and gamma =", g_val, "...\n")
    model_try <- try(
      svm(y ~ ., data = train_bank_data, kernel = "radial", cost = C_val, gamma = g_val, probability = TRUE),
      silent = TRUE
    )
    if (inherits(model_try, "try-error")) {
      cat("  Failed at C =", C_val, "gamma =", g_val, "\n")
      next
    }
    svm_rbf <- model_try
    pred_obj <- predict(svm_rbf, newdata = test_bank_data, probability = TRUE)
    probs <- attr(pred_obj, "probabilities")[, "yes"]
    metrics <- eval_model(test_bank_data$y, factor(pred_obj, levels = c("no", "yes")), probs)
    
    if (!is.na(metrics$AUC) && metrics$AUC > best_auc) {
      best_auc <- metrics$AUC
      best_rbf <- list(C = C_val, gamma = g_val, metrics = metrics)
    }
  }
}

## Training RBF SVM with C = 0.5 and gamma = 0.01 ...

## Setting direction: controls > cases

## Training RBF SVM with C = 0.5 and gamma = 0.05 ...

## Setting direction: controls > cases

## Training RBF SVM with C = 1 and gamma = 0.01 ...

## Setting direction: controls > cases

## Training RBF SVM with C = 1 and gamma = 0.05 ...

## Setting direction: controls > cases

results <- results %>%
  add_row(
    Model = "SVM (RBF)",
    Hyperparameters = paste0("C=", best_rbf$C, ", gamma=", best_rbf$gamma),
    Accuracy = best_rbf$metrics$Accuracy,
    Precision = best_rbf$metrics$Precision,
    Recall = best_rbf$metrics$Recall,
    F1 = best_rbf$metrics$F1,
    AUC = best_rbf$metrics$AUC
  )

# Final Results 
results <- results %>%
  mutate(across(where(is.numeric), round, 3))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.numeric), round, 3)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

knitr::kable(results, caption = "Model Performance Comparison: Decision Tree vs SVM")

Model Performance Comparison: Decision Tree vs SVM
Model	Hyperparameters	Accuracy	Precision	Recall	F1	AUC
Decision Tree	maxdepth=5, minsplit=20	0.898	0.629	0.367	0.463	0.845
SVM (Linear)	C=1	0.898	0.725	0.242	0.362	0.916
SVM (RBF)	C=0.5, gamma=0.01	0.912	0.776	0.375	0.506	0.920

Discussion and Conclusion

According to the findings, the SVM with RBF kernel had the best AUC and prediction accuracy, indicating better generalization on nonlinear patterns. The same performance of the Decision Tree and Linear SVM suggests that the complexity of customer subscription behaviors may not be adequately captured by linear limits.

While the Decision Tree is still useful for interpretability, the RBF model’s variable margin strikes a balance between low bias and controlled variance. From a business standpoint, combining the accuracy of SVM models with the interpretability of Decision Trees can increase targeting effectiveness and save marketing expenses.

#References

Ahmad, F. et al. (2021). A comparative study of machine learning classifiers for prediction of COVID-19 infection susceptibility. Complexity, 2021.

Guhathakurata, S., Banerjee, A., & Roy, A. (2021). Use of support vector machine (SVM) to predict COVID-19 infection. Bioinformation, 17(5), 454–460.

Mehrpour, O., Zamani, N., & Hassanian-Moghaddam, H. (2022). Comparison of decision tree and support vector machine algorithms in predicting outcomes of poisoning cases. BMC Medical Informatics and Decision Making, 22(1), 121. ```

Comparative Analysis of Decision Tree & SVM

Woodelyne Durosier

2025-11-13

Introduction

Model Training and Evaluation

Discussion and Conclusion