Importing and Preprocessing Data from Previous Assignments
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.5.2
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(themis)
## Warning: package 'themis' was built under R version 4.5.2
## Loading required package: recipes
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(PRROC)
## Loading required package: rlang
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
set.seed(24601)
data <- read.csv('https://raw.githubusercontent.com/ddebonis47/classwork/refs/heads/main/bank-full.csv', sep = ';', stringsAsFactors = TRUE)
# all copied and pasted from previous assignment
median_prev <- median(data$previous, na.rm = TRUE)
data$previous <- ifelse(data$previous == 275, median_prev, data$previous)
data$pdays <- as.numeric(data$pdays)
data$pdays[is.na(data$pdays)] <- -1
data <- data |>
mutate(poutcome=na_if(poutcome, "unknown"))
data <- data |>
mutate(job=na_if(job, "unknown"))
data <- data |>
mutate(education=na_if(education, "unknown"))
data <- data |>
mutate(contact=na_if(contact, "unknown"))
data$y <- factor(data$y, levels = c("no", "yes"))
### changed coding of date based on feedback
data <- data |>
mutate(
month_factor = factor(month, levels = tolower(month.abb), ordered = TRUE),
# Assign quarters based on month
quarter = case_when(
month_factor %in% c("jan","feb","mar") ~ "Q1",
month_factor %in% c("apr","may","jun") ~ "Q2",
month_factor %in% c("jul","aug","sep") ~ "Q3",
month_factor %in% c("oct","nov","dec") ~ "Q4"
),
# Assign seasons
season = case_when(
month_factor %in% c("dec","jan","feb") ~ "Winter",
month_factor %in% c("mar","apr","may") ~ "Spring",
month_factor %in% c("jun","jul","aug") ~ "Summer",
month_factor %in% c("sep","oct","nov") ~ "Fall"
),
# Bucket day into early/mid/late month
day_group = case_when(
day <= 10 ~ "Early",
day <= 20 ~ "Mid",
TRUE ~ "Late"
)
)
### ensuring removal of duration
data <- data |>
select(-duration)
data$total_contacts <- data$campaign + data$previous
data$contact_ratio <- data$campaign / pmax(data$previous, 1)
data$contacts_per_campaign <- with(data,
ifelse((campaign + previous) > 0,
campaign / (campaign + previous),
NA))
data$contacts_per_campaign[is.na(data$contacts_per_campaign)] <- 0
# Columns where unknown or NA should be a single level
cat_vars <- c("poutcome","job","education","contact","quarter","season","day_group")
for(col in cat_vars){
data[[col]] <- as.character(data[[col]])
data[[col]][is.na(data[[col]])] <- "unknown" # replace NAs
data[[col]][data[[col]]=="unknown"] <- "unknown" # replace any lingering "unknown"
data[[col]] <- factor(data[[col]]) # convert back to factor
}
# Subset to 5,000 rows (optional for speed)
data_subset <- data[sample(1:nrow(data), 5000), ]
# Train/Test Split
trainIndex <- createDataPartition(data_subset$y, p = 0.8, list = FALSE)
trainData <- data_subset[trainIndex, ]
testData <- data_subset[-trainIndex, ]
# Evaluation function
evaluate_model <- function(model, testData, positive_class = "yes") {
# Predicted probabilities
probs <- predict(model, newdata = testData, type = "prob")
# Predicted labels
preds <- predict(model, newdata = testData)
# ROC AUC
roc_obj <- roc(response = testData$y, predictor = probs[, positive_class])
roc_auc <- auc(roc_obj)
# PR AUC
fg <- probs[, positive_class]
bg <- probs[, setdiff(colnames(probs), positive_class)]
pr_obj <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
pr_auc <- pr_obj$auc.integral
# Confusion Matrix metrics
cm <- confusionMatrix(preds, testData$y, positive = positive_class)
precision <- cm$byClass["Precision"]
recall <- cm$byClass["Recall"]
f1 <- cm$byClass["F1"]
# Accuracy
accuracy <- mean(preds == testData$y)
data.frame(
Model = model$method,
Accuracy = round(accuracy, 4),
ROC_AUC = round(roc_auc, 4),
PR_AUC = round(pr_auc, 4),
Precision = round(precision, 4),
Recall = round(recall, 4),
F1 = round(f1, 4)
)
}
# Function to train all kernels
run_svm_models <- function(trainData, testData, use_smote = FALSE) {
ctrl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 2,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE,
sampling = if(use_smote) "smote" else NULL
)
# Linear-like SVM (radial with tiny sigma)
svm_linear_like <- train(
y ~ ., data = trainData,
method = "svmRadial",
metric = "ROC",
preProcess = c("center","scale"),
trControl = ctrl,
tuneGrid = expand.grid(
sigma = 0.0001,
C = 2^(-1:2)
)
)
# Polynomial SVM
svm_poly <- train(
y ~ ., data = trainData,
method = "svmPoly",
metric = "ROC",
preProcess = c("center","scale"),
trControl = ctrl,
tuneGrid = expand.grid(
degree = c(2,3),
scale = c(0.001,0.01,0.1),
C = 2^(-1:2)
)
)
# RBF SVM
svm_rbf <- train(
y ~ ., data = trainData,
method = "svmRadial",
metric = "ROC",
preProcess = c("center","scale"),
trControl = ctrl,
tuneLength = 5
)
# Evaluate all models
models <- list(LinearLike = svm_linear_like, Poly = svm_poly, RBF = svm_rbf)
results_metrics <- do.call(rbind, lapply(models, evaluate_model, testData))
# Print SMOTE info
if(use_smote){
cat("\n--- SVM Results with SMOTE ---\n")
} else {
cat("\n--- SVM Results without SMOTE ---\n")
}
print(results_metrics)
# Resample comparison
resamp_results <- resamples(models)
bwplot(resamp_results, metric = "ROC", main = if(use_smote) "ROC Comparison with SMOTE" else "ROC Comparison without SMOTE")
return(list(models = models, metrics = results_metrics))
}
# ---- Run both scenarios ----
res_no_smote <- run_svm_models(trainData, testData, use_smote = FALSE)
## line search fails -0.2733741 1.554954 2.291108e-05 -5.195745e-06 -1.038076e-08 3.984548e-09 -2.585372e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -0.2755675 1.516934 1.942097e-05 -4.672667e-06 -9.122875e-09 3.108634e-09 -1.917007e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -0.2796036 1.528292 1.605331e-05 -3.55773e-06 -5.924931e-09 4.446595e-09 -1.109345e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
##
## --- SVM Results without SMOTE ---
## Model Accuracy ROC_AUC PR_AUC Precision Recall F1
## LinearLike svmRadial 0.8929 0.7600 0.3105 0.6383 0.2500 0.3593
## Poly svmPoly 0.8949 0.7541 0.3093 0.6829 0.2333 0.3478
## RBF svmRadial 0.8949 0.7428 0.3200 0.6744 0.2417 0.3558
res_smote <- run_svm_models(trainData, testData, use_smote = TRUE)
## line search fails -1.698917 -0.2102793 1.346651e-05 6.599915e-06 -4.14791e-08 -1.255909e-08 -6.414677e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## line search fails -0.9429723 0.5497187 1.5085e-05 2.836673e-07 -9.4511e-09 4.415262e-09 -1.413173e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -1.551299 -0.2941999 1.151151e-05 5.9291e-06 -2.646198e-08 -1.591249e-08 -3.989642e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## line search fails -1.663898 -0.2526609 1.154468e-05 5.371295e-06 -2.766122e-08 -1.427312e-08 -3.96005e-13
## Warning in method$predict(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class prediction calculations failed; returning NAs
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
##
## --- SVM Results with SMOTE ---
## Model Accuracy ROC_AUC PR_AUC Precision Recall F1
## LinearLike svmRadial 0.7998 0.7713 0.3956 0.3148 0.5667 0.4048
## Poly svmPoly 0.8158 0.7642 0.3969 0.3333 0.5333 0.4103
## RBF svmRadial 0.8779 0.7664 0.3259 0.4909 0.4500 0.4696
# Define hyperparameter columns
hyper_cols <- c("sigma", "C", "degree", "scale")
get_hyper <- function(model, hyper_cols){
hp <- model$bestTune
# Ensure only one row (best overall)
if(nrow(hp) > 1) hp <- hp[1, ]
# Add missing columns as NA
for(col in hyper_cols){
if(!col %in% names(hp)) hp[[col]] <- NA
}
# Order columns
hp <- hp[hyper_cols]
return(hp)
}
# Summaries for both cases
summary_no <- do.call(rbind, lapply(names(res_no_smote$models), function(name) {
metric_row <- evaluate_model(res_no_smote$models[[name]], testData)
hp <- get_hyper(res_no_smote$models[[name]], hyper_cols)
cbind(metric_row, hp, SMOTE = "No")
}))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
summary_yes <- do.call(rbind, lapply(names(res_smote$models), function(name) {
metric_row <- evaluate_model(res_smote$models[[name]], testData)
hp <- get_hyper(res_smote$models[[name]], hyper_cols)
cbind(metric_row, hp, SMOTE = "Yes")
}))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Combine into one table
summary_table <- rbind(summary_no, summary_yes)
# Reorder columns for readability
summary_table <- summary_table[, c("Model", "SMOTE", hyper_cols,
"Accuracy", "ROC_AUC", "PR_AUC",
"Precision", "Recall", "F1")]
print(summary_table)
## Model SMOTE sigma C degree scale Accuracy ROC_AUC
## Precision svmRadial No 0.00010000 0.50 NA NA 0.8929 0.7600
## Precision1 svmPoly No NA 1.00 2 0.001 0.8949 0.7541
## Precision2 svmRadial No 0.01164917 0.25 NA NA 0.8949 0.7428
## Precision3 svmRadial Yes 0.00010000 4.00 NA NA 0.7998 0.7713
## Precision11 svmPoly Yes NA 2.00 3 0.001 0.8158 0.7642
## Precision21 svmRadial Yes 0.01121745 0.25 NA NA 0.8779 0.7664
## PR_AUC Precision Recall F1
## Precision 0.3105 0.6383 0.2500 0.3593
## Precision1 0.3093 0.6829 0.2333 0.3478
## Precision2 0.3200 0.6744 0.2417 0.3558
## Precision3 0.3956 0.3148 0.5667 0.4048
## Precision11 0.3969 0.3333 0.5333 0.4103
## Precision21 0.3259 0.4909 0.4500 0.4696