library(tidyverse)     # data wrangling and plotting (dplyr, ggplot2, etc.)
library(caret)         # createDataPartition() and confusionMatrix()
library(randomForest)  # random forests
library(rpart)         # decision trees
library(adabag)        # bagging and boosting
library(pROC)          # roc() and auc()
library(ada)           # boosted classification trees
library(mice)          # multiple imputation
library(e1071)         # svm(), tune.svm(), tune.control()
set.seed(02180)        # reproducible partitioning and cross-validation

Import the data from Project 2

bank <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 622\\Assignment 1\\DATA\\bank-additional\\bank-additional-full.csv", sep = ';')
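A quick structural check of what was read in (illustrative; output not shown):

dim(bank)       # rows and columns read in
glimpse(bank)   # column types at a glance (dplyr)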

# Recode the literal string "unknown" to NA across all columns
bank <- bank %>%
  mutate(across(everything(), ~ ifelse(. == "unknown", NA, .)))

# Complete-case analysis: drop any row with at least one NA
bank_comp <- na.omit(bank)
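Complete-case filtering can discard a substantial share of rows, so it is worth checking the cost; a quick sanity check, not part of the original output:

colSums(is.na(bank))           # NAs introduced per column by the recoding
nrow(bank) - nrow(bank_comp)   # rows dropped by the complete-case filter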

Split the data

Training and testing data are split 75%/25% for the first (default-parameter) experiment.

# Stratified 75/25 split on the outcome y
trainPart <- createDataPartition(bank_comp$y, p = 0.75, list = FALSE)
trainDat <- bank_comp[trainPart, ]
testDat <- bank_comp[-trainPart, ]

# Ensure the outcome is a factor for svm() and confusionMatrix()
trainDat$y <- as.factor(trainDat$y)
testDat$y <- as.factor(testDat$y)
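Because createDataPartition() samples within each level of y, the class proportions should be nearly identical in the two sets; an illustrative check:

prop.table(table(trainDat$y))   # class shares in the training set
prop.table(table(testDat$y))    # class shares in the test set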

SVM Model Setup

This section analyzes the dataset from Homework #2 using the SVM algorithm, comparing linear and radial kernels.

Linear Kernel

# NOTE: svm() defaults to a radial kernel, so the linear kernel must be
# requested explicitly
svm_lin <- svm(formula = y ~ ., data = trainDat, kernel = "linear",
               cost = 1, probability = TRUE, scale = TRUE)
svm_lin_pred <- predict(svm_lin, testDat)
svm_lin_prob <- predict(svm_lin, testDat, probability = TRUE)
svm_confusion <- confusionMatrix(svm_lin_pred, testDat$y, positive = "yes")
svm_lin_accuracy <- svm_confusion$overall["Accuracy"]
svm_lin_sensitivity <- svm_confusion$byClass["Sensitivity"]
svm_lin_specificity <- svm_confusion$byClass["Specificity"]


svm_ROC_lin <- roc(testDat$y, as.numeric(attr(svm_lin_prob, "probabilities")[,"yes"]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
svm_lin_auc <- auc(svm_ROC_lin)
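The curve behind that AUC can be drawn directly with pROC's plot method; a minimal sketch:

plot(svm_ROC_lin, print.auc = TRUE, legacy.axes = TRUE)   # ROC curve with AUC annotated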

Combine performance metrics into a data frame

svm_lin_results <- data.frame(
    Method = "SVM Linear Algorithm",
    AUC = as.numeric(svm_lin_auc),
    Accuracy = svm_lin_accuracy,
    Sensitivity = svm_lin_sensitivity,
    Specificity = svm_lin_specificity
)
print(svm_lin_results)
##                        Method       AUC  Accuracy Sensitivity Specificity
## Accuracy SVM Linear Algorithm 0.9197253 0.8981761   0.3578838   0.9764158

Linear Kernel (Tuned)

# Candidate cost values for the grid search (a plain vector suffices here)
tune_lin_costs <- c(0.001, 0.01, 0.1, 1, 5, 10)

tune_lin_control <- tune.svm(y ~ ., data = trainDat, kernel = "linear",
                             cost = tune_lin_costs,
                             tunecontrol = tune.control(cross = 2))
print(tune_lin_control)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 2-fold cross validation 
## 
## - best parameters:
##  cost
##    10
## 
## - best performance: 0.1059169
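The tune object also records the cross-validated error for every cost tried, not just the winner; for example:

tune_lin_control$performances   # CV error and dispersion across the cost grid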

Pull the best cost found for the linear kernel and retrain the model with it

tuned_lin_cost <- tune_lin_control$best.parameters$cost

# Again specify the linear kernel explicitly (svm() defaults to radial)
svm_lin_tune <- svm(y ~ ., data = trainDat, kernel = "linear",
                    cost = tuned_lin_cost, probability = TRUE, scale = TRUE)
svm_tune_pred <- predict(svm_lin_tune, testDat)
svm_tune_prob <- predict(svm_lin_tune, testDat, probability = TRUE)
svm_tune_confusion <- confusionMatrix(svm_tune_pred, testDat$y, positive = "yes")
svm_tune_accuracy <- svm_tune_confusion$overall["Accuracy"]
svm_tune_sensitivity <- svm_tune_confusion$byClass["Sensitivity"]
svm_tune_specificity <- svm_tune_confusion$byClass["Specificity"]


svm_ROC_tune <- roc(testDat$y, as.numeric(attr(svm_tune_prob, "probabilities")[,"yes"]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
svm_tune_auc <- auc(svm_ROC_tune)

Combine performance metrics into a data frame

svm_tune_results <- data.frame(
    Method = "SVM Linear Algorithm (Tuned)",
    AUC = as.numeric(svm_tune_auc),
    Accuracy = svm_tune_accuracy,
    Sensitivity = svm_tune_sensitivity,
    Specificity = svm_tune_specificity
)
print(svm_tune_results)
##                                Method       AUC  Accuracy Sensitivity
## Accuracy SVM Linear Algorithm (Tuned) 0.9215606 0.9006692   0.4045643
##          Specificity
## Accuracy   0.9725101

Basic Radial SVM

svm_rad <- svm(formula = y ~ .,
               data = trainDat,
               kernel = "radial",
               probability = TRUE,
               scale = TRUE)

# predict.svm() returns class labels by default; type = "class" is not an
# argument it recognizes, so it is dropped here
svm_rad_pred <- predict(svm_rad, testDat)
svm_rad_prob <- predict(svm_rad, testDat, probability = TRUE)


svm_rad_confusion <- confusionMatrix(svm_rad_pred, testDat$y, positive = "yes")
svm_rad_accuracy <- svm_rad_confusion$overall["Accuracy"]
svm_rad_sensitivity <- svm_rad_confusion$byClass["Sensitivity"]
svm_rad_specificity <- svm_rad_confusion$byClass["Specificity"]


svm_rad_ROC <- roc(testDat$y, as.numeric(attr(svm_rad_prob, "probabilities")[,"yes"]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
svm_rad_auc <- auc(svm_rad_ROC)


svm_rad_results <- data.frame(
    Method = "SVM Radial Algorithm (Standard)",
    AUC = as.numeric(svm_rad_auc),
    Accuracy = svm_rad_accuracy,
    Sensitivity = svm_rad_sensitivity,
    Specificity = svm_rad_specificity
)
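The radial fit above uses the default cost (1) and default gamma (1 over the data dimension). A tuning pass analogous to the linear one might look like the sketch below; the grid values are illustrative assumptions, not tuned results.

tune_rad_control <- tune.svm(y ~ ., data = trainDat, kernel = "radial",
                             cost = c(0.1, 1, 10),        # assumed grid
                             gamma = c(0.001, 0.01, 0.1), # assumed grid
                             tunecontrol = tune.control(cross = 2))
tune_rad_control$best.parameters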

Combine the three sets of metrics

combined_metrics <- rbind(svm_lin_results, svm_tune_results, svm_rad_results)
print(combined_metrics)
##                                      Method       AUC  Accuracy Sensitivity
## Accuracy               SVM Linear Algorithm 0.9197253 0.8981761   0.3578838
## Accuracy1      SVM Linear Algorithm (Tuned) 0.9215606 0.9006692   0.4045643
## Sensitivity SVM Radial Algorithm (Standard) 0.9197253 0.8981761   0.3578838
##             Specificity
## Accuracy      0.9764158
## Accuracy1     0.9725101
## Sensitivity   0.9764158
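For a visual comparison of the three models, the ROC objects can be overlaid with pROC's ggroc(); a minimal sketch (ggplot2 is available via tidyverse):

ggroc(list(`SVM Linear` = svm_ROC_lin,
           `SVM Linear (Tuned)` = svm_ROC_tune,
           `SVM Radial` = svm_rad_ROC),
      legacy.axes = TRUE) +
  labs(colour = "Model")   # one curve per model, colored by name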