knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(caret)
library(tidyverse)
library(doParallel)
library(MLmetrics)
library(kableExtra)
library(e1071)
library(pROC)
library(PRROC)
library(kernlab)
set.seed(8675309)
# Stop any existing cluster; otherwise RStudio keeps spinning up new ones
if (exists("cl") && inherits(cl, "cluster") && foreach::getDoParWorkers() > 1) {
  tryCatch({
    stopCluster(cl)
    message("Cluster stopped successfully.")
  }, error = function(e) {
    message("Error stopping cluster: ", e$message)
  })
} else {
  message("No active cluster to stop or cluster variable not defined.")
}
# Leave a couple of cores free for the OS and RStudio
cl <- makeCluster(max(1, detectCores() - 2))
registerDoParallel(cl)
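For completeness, a minimal cleanup sketch is shown below; these lines are not part of the original setup and would typically be run in a final chunk once all models have trained.
# Run after all model training is complete: stop the workers and return to sequential execution
# stopCluster(cl)
# registerDoSEQ()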
# Summary function with better metrics for evaluating performance on imbalanced datasets
imbalanced_summary <- function(data, lev = NULL, model = NULL) {
  predictions <- data$pred
  actual <- data$obs
  tp <- sum(predictions == "yes" & actual == "yes")
  tn <- sum(predictions == "no" & actual == "no")
  fp <- sum(predictions == "yes" & actual == "no")
  fn <- sum(predictions == "no" & actual == "yes")
  rec <- tp / (tp + fn)
  spec <- tn / (tn + fp)
  if (tp + fp > 0) {
    prec <- tp / (tp + fp)
  } else {
    prec <- 0
  }
  if (prec + rec > 0) {
    f1 <- (2 * prec * rec) / (prec + rec)
  } else {
    f1 <- 0
  }
  g_mean <- sqrt(rec * spec)
  auc_roc <- NA
  auc_pr <- NA
  if ("yes" %in% colnames(data)) {
    tryCatch({
      roc_obj <- pROC::roc(actual == "yes", data$yes, quiet = TRUE)
      auc_roc <- as.numeric(pROC::auc(roc_obj))
      actual_numeric <- as.numeric(actual == "yes")
      pr_curve <- PRROC::pr.curve(scores.class0 = data$yes,
                                  weights.class0 = actual_numeric,
                                  curve = TRUE)
      auc_pr <- pr_curve$auc.integral
    }, error = function(e) {
      # Just continue if there's an error; the AUC values stay NA
    })
  }
  # Return metrics as a named vector, matching caret's summaryFunction convention
  out <- c(`AUC-ROC` = auc_roc, `AUC-PR` = auc_pr,
           Recall = rec, Specificity = spec,
           Precision = prec, F1 = f1, GMean = g_mean)
  return(out)
}
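Because imbalanced_summary follows caret's summaryFunction signature (data, lev, model), it could also, as an alternative not used in this analysis, be passed to trainControl() so these metrics drive the resampling results rather than being computed only on the test set. A minimal sketch (the example_ctrl name is just for illustration):
# Alternative (not used below): score resamples with the custom metrics during training
example_ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,                    # needed so the "yes" probability column exists
  summaryFunction = imbalanced_summary
)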
The original, untransformed data will be used to train the tree models; the SVM models, however, require more data preparation to achieve the best performance.
# Based on information in the "bank-additional-names.txt" file
column_types <- list(age="n", job="f", marital="f", education="f", default="f", housing="f", loan="f", contact="f", month="f", day_of_week="f", duration="n", campaign="n", pdays="n", previous="n", poutcome="f", emp.var.rate="d", cons.price.idx="d", cons.conf.idx="d", euribor3m="d", nr.employed="d", y="f")
bank_add_full <- read_delim("bank-additional/bank-additional-full.csv", delim = ";", col_types = column_types) |>
select(!duration)
# Releveling to make yes the positive class by default
bank_add_full$y <- factor(bank_add_full$y, levels = c("yes", "no"))
# Smaller dataset, kept for quick tuning iterations (the full dataset is used below)
bank_add <- read_delim("bank-additional/bank-additional.csv", delim = ";", col_types = column_types) |>
select(!duration)
bank_add$y <- factor(bank_add$y, levels = c("yes", "no"))
dataset_to_use <- bank_add_full
The categorical variables will be one-hot encoded so the SVM models can use them without issue. Not all of these variables have a logical ordering, so converting them to ordered factors is not appropriate in every case. Since one-hot encoding is a reasonable alternative, it is applied to all of the categorical variables.
set.seed(8675309)
# Identify the categorical predictors (read in as factors), excluding the target
cat_vars <- sapply(dataset_to_use[, !names(dataset_to_use) %in% "y"], \(x) is.factor(x) || is.character(x))
cat_var_names <- names(cat_vars)[cat_vars]
# Create dummyVars object - only for the categorical predictors, excluding the target
dummy_vars <- dummyVars(formula = "~ .", data = dataset_to_use[, cat_var_names])
# Apply transformation to get dummy variables
dummy_data <- predict(dummy_vars, dataset_to_use[, cat_var_names])
# Combine with numeric variables and target
numeric_vars <- dataset_to_use[, !names(dataset_to_use) %in% c(cat_var_names, "y")]
processed_data_caret <- cbind(numeric_vars, dummy_data, y = dataset_to_use$y)
Correlated variables can have a strong negative impact on SVM performance, so highly correlated variables will be excluded from the training and test sets used for the SVM models. The tree-based models will be trained on the original untransformed dataset, as they are not vulnerable to correlated variables and can handle categorical variables directly.
correlated_columns <- findCorrelation(cor(processed_data_caret |> select(-y)),
                                      cutoff = 0.70, verbose = FALSE, names = TRUE)
good_set <- processed_data_caret |>
  select(-all_of(correlated_columns))
set.seed(8675309)
inTraining <- createDataPartition(good_set$y, p = .8, list = FALSE)
training <- good_set[ inTraining,]
testing <- good_set[-inTraining,]
Due to extremely long training times for the SVM models on the available hardware when using the full dataset, the decision was made to downsample the data. Downsampling the majority class still achieves a balance between “yes” and “no” cases, which should improve model performance while reducing training time. It does mean the models are trained on significantly less data than if no sampling or upsampling had been used, and model performance should be expected to reflect this. Unfortunately there was no reasonable alternative given the hardware restrictions.
The fields are centered and scaled prior to training the SVM models. Sensitivity is the selection criterion for all models, as we are primarily concerned with identifying the “yes” cases. The tree-based models are trained on the untransformed, non-encoded dataset with an identical train/test split, meaning the actual observations in each set are the same for both model families. Even though the tree-based models train significantly faster than the SVM models, they also use downsampling to ensure parity for comparison.
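To make the effect of the `sampling = "down"` option concrete, a small illustration using caret's downSample() on the training set is shown below (illustration only; caret performs this automatically inside each cross-validation fold, so the down_demo object is not used elsewhere).
# Illustration only: this is what caret does inside each fold when sampling = "down"
down_demo <- downSample(x = training |> select(-y), y = training$y, yname = "y")
table(training$y)   # original, imbalanced class counts
table(down_demo$y)  # majority class reduced to match the minority class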
The linear SVM model was trained with 3-fold cross-validation. The tuning values for the cost parameter were 0.01, 0.1, 1, 10, and 32.
set.seed(8675309)
linear_ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  sampling = "down",
  allowParallel = FALSE
)
tuneGrid_linear <- expand.grid(
  C = c(0.01, 0.1, 1, 10, 32)
)
svm_linear <- train(
  y ~ .,
  data = training,
  method = "svmLinear",
  trControl = linear_ctrl,
  tuneGrid = tuneGrid_linear,
  metric = "Sens",
  preProcess = c("center", "scale")
)
## + Fold1: C= 0.01
## - Fold1: C= 0.01
## + Fold1: C= 0.10
## - Fold1: C= 0.10
## + Fold1: C= 1.00
## - Fold1: C= 1.00
## + Fold1: C=10.00
## - Fold1: C=10.00
## + Fold1: C=32.00
## - Fold1: C=32.00
## + Fold2: C= 0.01
## - Fold2: C= 0.01
## + Fold2: C= 0.10
## - Fold2: C= 0.10
## + Fold2: C= 1.00
## - Fold2: C= 1.00
## + Fold2: C=10.00
## - Fold2: C=10.00
## + Fold2: C=32.00
## - Fold2: C=32.00
## + Fold3: C= 0.01
## - Fold3: C= 0.01
## + Fold3: C= 0.10
## - Fold3: C= 0.10
## + Fold3: C= 1.00
## - Fold3: C= 1.00
## + Fold3: C=10.00
## - Fold3: C=10.00
## + Fold3: C=32.00
## - Fold3: C=32.00
## Aggregating results
## Selecting tuning parameters
## Fitting C = 10 on full training set
svm_linear_predictions <- predict(svm_linear, testing)
svm_linear_probabilities <- predict(svm_linear, testing, type = "prob")
linear_eval_data <- data.frame(pred = svm_linear_predictions, obs = testing$y, yes = svm_linear_probabilities$yes, no = svm_linear_probabilities$no)
linear_summary <- imbalanced_summary(data = linear_eval_data)
The radial SVM model was trained with 3-fold cross-validation. The tuning values for the cost parameter were 0.1, 1, and 10, and for sigma 0.01, 0.1, and 1. Larger ranges of hyperparameter values were prohibitive in terms of training time.
set.seed(8675309)
radial_ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  sampling = "down",
  allowParallel = FALSE
)
tuneGrid_radial <- expand.grid(
  sigma = c(0.01, 0.1, 1),
  C = c(0.1, 1, 10)
)
svm_radial <- train(
  y ~ .,
  data = training,
  method = "svmRadial",
  trControl = radial_ctrl,
  tuneGrid = tuneGrid_radial,
  metric = "Sens",
  preProcess = c("center", "scale")
)
## + Fold1: sigma=0.01, C= 0.1
## - Fold1: sigma=0.01, C= 0.1
## + Fold1: sigma=0.10, C= 0.1
## - Fold1: sigma=0.10, C= 0.1
## + Fold1: sigma=1.00, C= 0.1
## - Fold1: sigma=1.00, C= 0.1
## + Fold1: sigma=0.01, C= 1.0
## - Fold1: sigma=0.01, C= 1.0
## + Fold1: sigma=0.10, C= 1.0
## - Fold1: sigma=0.10, C= 1.0
## + Fold1: sigma=1.00, C= 1.0
## - Fold1: sigma=1.00, C= 1.0
## + Fold1: sigma=0.01, C=10.0
## - Fold1: sigma=0.01, C=10.0
## + Fold1: sigma=0.10, C=10.0
## - Fold1: sigma=0.10, C=10.0
## + Fold1: sigma=1.00, C=10.0
## - Fold1: sigma=1.00, C=10.0
## + Fold2: sigma=0.01, C= 0.1
## - Fold2: sigma=0.01, C= 0.1
## + Fold2: sigma=0.10, C= 0.1
## - Fold2: sigma=0.10, C= 0.1
## + Fold2: sigma=1.00, C= 0.1
## - Fold2: sigma=1.00, C= 0.1
## + Fold2: sigma=0.01, C= 1.0
## - Fold2: sigma=0.01, C= 1.0
## + Fold2: sigma=0.10, C= 1.0
## - Fold2: sigma=0.10, C= 1.0
## + Fold2: sigma=1.00, C= 1.0
## - Fold2: sigma=1.00, C= 1.0
## + Fold2: sigma=0.01, C=10.0
## - Fold2: sigma=0.01, C=10.0
## + Fold2: sigma=0.10, C=10.0
## - Fold2: sigma=0.10, C=10.0
## + Fold2: sigma=1.00, C=10.0
## - Fold2: sigma=1.00, C=10.0
## + Fold3: sigma=0.01, C= 0.1
## - Fold3: sigma=0.01, C= 0.1
## + Fold3: sigma=0.10, C= 0.1
## - Fold3: sigma=0.10, C= 0.1
## + Fold3: sigma=1.00, C= 0.1
## - Fold3: sigma=1.00, C= 0.1
## + Fold3: sigma=0.01, C= 1.0
## - Fold3: sigma=0.01, C= 1.0
## + Fold3: sigma=0.10, C= 1.0
## - Fold3: sigma=0.10, C= 1.0
## + Fold3: sigma=1.00, C= 1.0
## - Fold3: sigma=1.00, C= 1.0
## + Fold3: sigma=0.01, C=10.0
## - Fold3: sigma=0.01, C=10.0
## + Fold3: sigma=0.10, C=10.0
## - Fold3: sigma=0.10, C=10.0
## + Fold3: sigma=1.00, C=10.0
## - Fold3: sigma=1.00, C=10.0
## Aggregating results
## Selecting tuning parameters
## Fitting sigma = 0.01, C = 1 on full training set
svm_radial_predictions <- predict(svm_radial, testing)
svm_radial_probabilities <- predict(svm_radial, testing, type = "prob")
radial_eval_data <- data.frame(pred = svm_radial_predictions, obs = testing$y, yes = svm_radial_probabilities$yes, no = svm_radial_probabilities$no)
radial_summary <- imbalanced_summary(data = radial_eval_data)
The XGBoost model was trained with 3-fold cross-validation. The tuning values were the maximum tree depth at 3, 4, and 5, the shrinkage factor (eta) at 0.05, 0.1, 0.2, and 0.3, and the number of boosting iterations at 100, 200, and 500. Other hyperparameters were held constant.
set.seed(8675309)
# Same partition indices as the SVM split, so both model families see the same observations
tree_training <- dataset_to_use[inTraining, ]
tree_testing <- dataset_to_use[-inTraining, ]
set.seed(8675309)
xgb_ctrl <- trainControl(
  method = "cv",
  number = 3,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  savePredictions = "final",
  sampling = "down"
)
xgb_grid <- expand.grid(
  nrounds = c(100, 200, 500),
  max_depth = c(3, 4, 5),
  eta = c(0.05, 0.1, 0.2, 0.3),
  gamma = 0.5,
  colsample_bytree = 0.75,
  min_child_weight = 5,
  subsample = 1
)
xgb_model <- train(
  y ~ .,
  data = tree_training,
  method = "xgbTree",
  trControl = xgb_ctrl,
  tuneGrid = xgb_grid,
  metric = "Sens"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 100, max_depth = 5, eta = 0.1, gamma = 0.5, colsample_bytree = 0.75, min_child_weight = 5, subsample = 1 on full training set
xgb_predictions <- predict(xgb_model, tree_testing)
xgb_probabilities <- predict(xgb_model, tree_testing, type = "prob")
xgb_eval_data <- data.frame(pred = xgb_predictions, obs = tree_testing$y, yes = xgb_probabilities$yes, no = xgb_probabilities$no)
xgb_summary <- imbalanced_summary(data = xgb_eval_data)
The random forest model was trained with 3-fold cross-validation. The tuning values were the number of randomly selected predictors (2, 4, 6, and 8) and the minimum node size (1, 5, and 10).
set.seed(8675309)
rf_ctrl <- trainControl(
  method = "cv",
  number = 3,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  savePredictions = "final",
  allowParallel = TRUE,
  sampling = "down"
)
rf_grid <- expand.grid(
  predFixed = c(2, 4, 6, 8),
  minNode = c(1, 5, 10)
)
rf_model <- train(
  y ~ .,
  data = tree_training,
  method = "Rborist",
  trControl = rf_ctrl,
  tuneGrid = rf_grid,
  metric = "Sens"
)
## Aggregating results
## Selecting tuning parameters
## Fitting predFixed = 4, minNode = 5 on full training set
rf_predictions <- predict(rf_model, tree_testing)
rf_probabilities <- predict(rf_model, tree_testing, type = "prob")
rf_eval_data <- data.frame(pred = rf_predictions, obs = tree_testing$y, yes = rf_probabilities$yes, no = rf_probabilities$no)
rf_summary <- imbalanced_summary(data = rf_eval_data)
By virtually all metrics the tree-based models outperformed the SVM models, with the radial SVM managing only the highest specificity. While not leading on any other metric, the SVM models, and especially the radial model, perform very similarly to the tree-based models on most of them, achieving AUC-ROC, specificity, precision, F1, and G-Mean scores on par with, and sometimes exceeding, one of the tree-based models. This performance masks some serious shortfalls in the SVM models, however. The composite metrics are inflated by the SVMs' high specificity, concealing subpar recall. This imbalance is most evident in the AUC-PR metric, as one might expect: the SVM models significantly underperform the tree-based models there, and it shows the largest gap between the model classes of any tracked metric. Because the specificity gap is relatively small and the value of correctly capturing positive cases far outweighs the cost of mislabeling negative cases, the SVM models cannot be recommended over either tree model. Based on these results the random forest model is the best recommendation: it has a negligible difference in recall and AUC-PR compared to XGBoost while achieving slightly better specificity and precision. In practice either tree model would be a reasonable choice, and it should be noted that all models were handicapped by the need to downsample, the low number of cross-validation folds, and the narrow range of tuning parameters used in this analysis. These constraints stem from the available hardware, and lifting them could yield performance improvements that better separate the models.
results_df <- data.frame(linear_summary, radial_summary, xgb_summary, rf_summary) |>
  rename(`Linear SVM` = linear_summary, `Radial SVM` = radial_summary,
         `XGBoost` = xgb_summary, `Random Forest` = rf_summary)
results_df |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  t() |>
  as.data.frame() |>
  rownames_to_column(var = "Model") |>
  # Highlight the best value in each metric column
  mutate(across(where(is.numeric),
                \(x) cell_spec(x, background = ifelse(x == max(x, na.rm = TRUE),
                                                      "#FFFF99", "white")))) |>
  kbl(caption = "Performance of Optimal Models on the Test Set", escape = FALSE) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Model | AUC-ROC | AUC-PR | Recall | Specificity | Precision | F1 | GMean |
|---|---|---|---|---|---|---|---|
| Linear SVM | 0.785 | 0.397 | 0.605 | 0.864 | 0.361 | 0.452 | 0.723 |
| Radial SVM | 0.785 | 0.408 | 0.622 | 0.868 | 0.373 | 0.467 | 0.734 |
| XGBoost | 0.81 | 0.456 | 0.648 | 0.857 | 0.365 | 0.467 | 0.745 |
| Random Forest | 0.801 | 0.45 | 0.649 | 0.864 | 0.377 | 0.477 | 0.749 |
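To visualize the AUC-PR gap discussed above, the precision-recall curves for the four models can be overlaid using the evaluation data frames built earlier. This is a sketch using base graphics; the plotting details (colors, layout, helper name pr_for) are assumptions rather than part of the original analysis.
# Sketch: overlay the four test-set precision-recall curves
pr_for <- function(eval_data) {
  PRROC::pr.curve(scores.class0 = eval_data$yes,
                  weights.class0 = as.numeric(eval_data$obs == "yes"),
                  curve = TRUE)
}
pr_curves <- list(`Linear SVM`    = pr_for(linear_eval_data),
                  `Radial SVM`    = pr_for(radial_eval_data),
                  `XGBoost`       = pr_for(xgb_eval_data),
                  `Random Forest` = pr_for(rf_eval_data))
# Column 1 of $curve is recall, column 2 is precision
plot(pr_curves[[1]]$curve[, 1:2], type = "l", col = 1,
     xlim = c(0, 1), ylim = c(0, 1),
     xlab = "Recall", ylab = "Precision",
     main = "Precision-Recall Curves on the Test Set")
for (i in 2:4) lines(pr_curves[[i]]$curve[, 1:2], col = i)
legend("topright", legend = names(pr_curves), col = 1:4, lty = 1)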