This analysis improves on prior studies (Decision Tree, Random Forest, and AdaBoost) by extending the model training phase with Support Vector Machines (SVMs). The goal is to compare linear and nonlinear kernels (RBF) to identify which has greater predictive potential for customer term deposit subscriptions.
Precision, recall, and F1-score translate directly into marketing efficiency KPIs, ensuring that outreach activities target potential subscribers while minimizing communication resource waste.
# Load Library & Prepare Data
library(tidyverse)
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(e1071)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
bank_data <- read_csv("clean_bank_data.csv")
## Rows: 45211 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): job, marital, education, default, housing, loan, contact, month, p...
## dbl (7): age, balance, day, duration, campaign, pdays, previous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert target variable
bank_data$y <- factor(bank_data$y, levels = c("no", "yes"))
# Scale numeric columns
num_cols <- sapply(bank_data, is.numeric)
bank_data[num_cols] <- scale(bank_data[num_cols])
# Sample smaller subset to reduce lag
set.seed(123)
bank_data <- bank_data %>% sample_n(5000)
# Split Train/Test (80/20)
set.seed(123)
trainIndex <- createDataPartition(bank_data$y, p = 0.8, list = FALSE)
train_bank_data <- bank_data[trainIndex, ]
test_bank_data <- bank_data[-trainIndex, ]
# Evaluation Function
eval_model <- function(truth, pred_class, pred_prob, positive = "yes") {
cm <- caret::confusionMatrix(pred_class, truth, positive = positive)
auc_val <- NA
if (!is.null(pred_prob)) {
roc_obj <- try(pROC::roc(response = truth, predictor = pred_prob,
levels = rev(levels(truth))), silent = TRUE)
if (!inherits(roc_obj, "try-error")) auc_val <- as.numeric(pROC::auc(roc_obj))
}
tibble(
Accuracy = cm$overall["Accuracy"],
Precision = cm$byClass["Pos Pred Value"],
Recall = cm$byClass["Sensitivity"],
F1 = cm$byClass["F1"],
AUC = auc_val
)
}
# Initialize Results
results <- tibble(
Model = character(),
Hyperparameters = character(),
Accuracy = numeric(),
Precision = numeric(),
Recall = numeric(),
F1 = numeric(),
AUC = numeric()
)
# Decision Tree
set.seed(100)
dt_model <- rpart(y ~ ., data = train_bank_data, method = "class",
control = rpart.control(maxdepth = 5, minsplit = 20))
dt_pred_class <- predict(dt_model, test_bank_data, type = "class")
dt_pred_prob <- predict(dt_model, test_bank_data, type = "prob")[, "yes"]
dt_metrics <- eval_model(test_bank_data$y, dt_pred_class, dt_pred_prob)
## Setting direction: controls > cases
results <- results %>%
add_row(
Model = "Decision Tree",
Hyperparameters = "maxdepth=5, minsplit=20",
Accuracy = dt_metrics$Accuracy,
Precision = dt_metrics$Precision,
Recall = dt_metrics$Recall,
F1 = dt_metrics$F1,
AUC = dt_metrics$AUC
)
# Linear SVM
set.seed(111)
C_grid_linear <- c(0.1, 1, 10)
best_lin <- NULL
best_lin_auc <- -Inf
for (C_val in C_grid_linear) {
cat("Training Linear SVM with C =", C_val, "...\n")
model_try <- try(
svm(y ~ ., data = train_bank_data, kernel = "linear", cost = C_val, probability = TRUE),
silent = TRUE
)
if (inherits(model_try, "try-error")) {
cat(" Failed at C =", C_val, "\n")
next
}
svm_lin <- model_try
pred_obj <- predict(svm_lin, newdata = test_bank_data, probability = TRUE)
probs <- attr(pred_obj, "probabilities")[, "yes"]
metrics <- eval_model(test_bank_data$y, factor(pred_obj, levels = c("no", "yes")), probs)
if (!is.na(metrics$AUC) && metrics$AUC > best_lin_auc) {
best_lin_auc <- metrics$AUC
best_lin <- list(C = C_val, metrics = metrics)
}
}
## Training Linear SVM with C = 0.1 ...
## Setting direction: controls > cases
## Training Linear SVM with C = 1 ...
## Setting direction: controls > cases
## Training Linear SVM with C = 10 ...
## Setting direction: controls > cases
results <- results %>%
add_row(
Model = "SVM (Linear)",
Hyperparameters = paste0("C=", best_lin$C),
Accuracy = best_lin$metrics$Accuracy,
Precision = best_lin$metrics$Precision,
Recall = best_lin$metrics$Recall,
F1 = best_lin$metrics$F1,
AUC = best_lin$metrics$AUC
)
# RBF SVM
set.seed(111)
C_grid_rbf <- c(0.5, 1)
gamma_grid_rbf <- c(0.01, 0.05)
best_rbf <- NULL
best_auc <- -Inf
for (C_val in C_grid_rbf) {
for (g_val in gamma_grid_rbf) {
cat("Training RBF SVM with C =", C_val, "and gamma =", g_val, "...\n")
model_try <- try(
svm(y ~ ., data = train_bank_data, kernel = "radial", cost = C_val, gamma = g_val, probability = TRUE),
silent = TRUE
)
if (inherits(model_try, "try-error")) {
cat(" Failed at C =", C_val, "gamma =", g_val, "\n")
next
}
svm_rbf <- model_try
pred_obj <- predict(svm_rbf, newdata = test_bank_data, probability = TRUE)
probs <- attr(pred_obj, "probabilities")[, "yes"]
metrics <- eval_model(test_bank_data$y, factor(pred_obj, levels = c("no", "yes")), probs)
if (!is.na(metrics$AUC) && metrics$AUC > best_auc) {
best_auc <- metrics$AUC
best_rbf <- list(C = C_val, gamma = g_val, metrics = metrics)
}
}
}
## Training RBF SVM with C = 0.5 and gamma = 0.01 ...
## Setting direction: controls > cases
## Training RBF SVM with C = 0.5 and gamma = 0.05 ...
## Setting direction: controls > cases
## Training RBF SVM with C = 1 and gamma = 0.01 ...
## Setting direction: controls > cases
## Training RBF SVM with C = 1 and gamma = 0.05 ...
## Setting direction: controls > cases
results <- results %>%
add_row(
Model = "SVM (RBF)",
Hyperparameters = paste0("C=", best_rbf$C, ", gamma=", best_rbf$gamma),
Accuracy = best_rbf$metrics$Accuracy,
Precision = best_rbf$metrics$Precision,
Recall = best_rbf$metrics$Recall,
F1 = best_rbf$metrics$F1,
AUC = best_rbf$metrics$AUC
)
# Final Results
results <- results %>%
mutate(across(where(is.numeric), round, 3))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.numeric), round, 3)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
knitr::kable(results, caption = "Model Performance Comparison: Decision Tree vs SVM")
| Model | Hyperparameters | Accuracy | Precision | Recall | F1 | AUC |
|---|---|---|---|---|---|---|
| Decision Tree | maxdepth=5, minsplit=20 | 0.898 | 0.629 | 0.367 | 0.463 | 0.845 |
| SVM (Linear) | C=1 | 0.898 | 0.725 | 0.242 | 0.362 | 0.916 |
| SVM (RBF) | C=0.5, gamma=0.01 | 0.912 | 0.776 | 0.375 | 0.506 | 0.920 |
According to the findings, the SVM with RBF kernel had the best AUC and prediction accuracy, indicating better generalization on nonlinear patterns. The same performance of the Decision Tree and Linear SVM suggests that the complexity of customer subscription behaviors may not be adequately captured by linear limits.
While the Decision Tree is still useful for interpretability, the RBF model’s variable margin strikes a balance between low bias and controlled variance. From a business standpoint, combining the accuracy of SVM models with the interpretability of Decision Trees can increase targeting effectiveness and save marketing expenses.
#References
Ahmad, F. et al. (2021). A comparative study of machine learning classifiers for prediction of COVID-19 infection susceptibility. Complexity, 2021.
Guhathakurata, S., Banerjee, A., & Roy, A. (2021). Use of support vector machine (SVM) to predict COVID-19 infection. Bioinformation, 17(5), 454–460.
Mehrpour, O., Zamani, N., & Hassanian-Moghaddam, H. (2022). Comparison of decision tree and support vector machine algorithms in predicting outcomes of poisoning cases. BMC Medical Informatics and Decision Making, 22(1), 121. ```