library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)  # for SVM
## Warning: package 'e1071' was built under R version 4.4.2
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load the bank marketing dataset (semicolon-delimited CSV)
bank_data <- read.csv("https://raw.githubusercontent.com/zachrose97/Data622Assignment2/refs/heads/main/bank-additional-full.csv", sep = ";")

# Drop 'duration': it is only known after a call ends, so it would leak the outcome
bank_data <- bank_data %>% select(-duration)

# Convert character columns to factors for modeling
bank_data <- bank_data %>% mutate(across(where(is.character), as.factor))

# Standardize all numeric columns (mean 0, sd 1)
num_vars <- sapply(bank_data, is.numeric)
bank_data[num_vars] <- scale(bank_data[num_vars])

# Reproducible, stratified 70/30 train/test split
set.seed(123)
train_index <- createDataPartition(bank_data$y, p = 0.7, list = FALSE)
train_data <- bank_data[train_index, ]
test_data  <- bank_data[-train_index, ]
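
A small caveat on the preprocessing order: scale() above was applied to the full dataset before splitting, so the test rows influence the centering and scaling parameters. A leakage-free alternative (a sketch only, not what was run here) would fit the scaling on the training rows and reuse those parameters on the test set, for example with caret's preProcess():

# Sketch of split-then-scale preprocessing (alternative to the pipeline above);
# preProcess ignores factor columns and transforms only the numeric ones
preproc <- preProcess(train_data, method = c("center", "scale"))
train_scaled <- predict(preproc, train_data)
test_scaled  <- predict(preproc, test_data)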
# Fit a linear-kernel SVM; probability = TRUE enables class-probability output
svm_model_linear <- svm(y ~ ., data = train_data, kernel = "linear", probability = TRUE)

# Hard class predictions, plus the predicted probability of the "yes" class
pred_linear <- predict(svm_model_linear, test_data)
pred_probs_linear <- attr(predict(svm_model_linear, test_data, probability = TRUE), "probabilities")[, "yes"]

conf_matrix_linear <- confusionMatrix(pred_linear, test_data$y)
print(conf_matrix_linear)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  10825  1137
##        yes   139   255
##                                          
##                Accuracy : 0.8967         
##                  95% CI : (0.8912, 0.902)
##     No Information Rate : 0.8873         
##     P-Value [Acc > NIR] : 0.000444       
##                                          
##                   Kappa : 0.2482         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9873         
##             Specificity : 0.1832         
##          Pos Pred Value : 0.9049         
##          Neg Pred Value : 0.6472         
##              Prevalence : 0.8873         
##          Detection Rate : 0.8761         
##    Detection Prevalence : 0.9681         
##       Balanced Accuracy : 0.5853         
##                                          
##        'Positive' Class : no             
## 
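Because caret treats the first factor level ("no") as the positive class by default, the sensitivity above describes the majority class. To read the same matrix from the minority-class perspective, the positive class can be set explicitly; a small sketch, assuming the factor levels are c("no", "yes"):

# Recompute metrics with "yes" (subscribed) as the positive class;
# Sensitivity then measures how many actual subscribers were identified
conf_matrix_linear_yes <- confusionMatrix(pred_linear, test_data$y, positive = "yes")
conf_matrix_linear_yes$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]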
roc_linear <- roc(test_data$y, pred_probs_linear, levels = c("no", "yes"), direction = "<")
auc_linear <- auc(roc_linear)
print(paste("Linear SVM AUC:", round(auc_linear, 4)))
## [1] "Linear SVM AUC: 0.6828"
# Tune the radial-kernel SVM on a 5,000-row sample to keep runtime manageable
set.seed(123)
sampled_train <- train_data[sample(nrow(train_data), 5000), ]

# Grid search over the cost parameter using tune()'s internal cross-validation
tuned_svm_radial <- tune(
  svm,
  y ~ .,
  data = sampled_train,
  kernel = "radial",
  ranges = list(cost = c(0.1, 1, 10)),
  probability = TRUE
)

# Retain the best model found during tuning
svm_model_radial <- tuned_svm_radial$best.model
print(tuned_svm_radial$best.parameters)
##   cost
## 2    1
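The selected cost of 1 can be sanity-checked against the full tuning grid; tune() stores the cross-validated error for every candidate value in its performances element:

# Cross-validated error and dispersion for each cost value tried
print(tuned_svm_radial$performances)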
# Evaluate the tuned radial model on the held-out test set
pred_radial <- predict(svm_model_radial, test_data)
pred_probs_radial <- attr(predict(svm_model_radial, test_data, probability = TRUE), "probabilities")[, "yes"]

conf_matrix_radial <- confusionMatrix(pred_radial, test_data$y)
print(conf_matrix_radial)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  10876  1158
##        yes    88   234
##                                           
##                Accuracy : 0.8992          
##                  95% CI : (0.8937, 0.9044)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 1.321e-05       
##                                           
##                   Kappa : 0.2409          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9920          
##             Specificity : 0.1681          
##          Pos Pred Value : 0.9038          
##          Neg Pred Value : 0.7267          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8802          
##    Detection Prevalence : 0.9739          
##       Balanced Accuracy : 0.5800          
##                                           
##        'Positive' Class : no              
## 
roc_radial <- roc(test_data$y, pred_probs_radial, levels = c("no", "yes"), direction = "<")
auc_radial <- auc(roc_radial)
print(paste("Radial SVM AUC:", round(auc_radial, 4)))
## [1] "Radial SVM AUC: 0.6863"
# Assemble the headline metrics for both SVMs into one summary table
results_table_svm <- data.frame(
  Model = c("SVM (Linear)", "SVM (Radial)"),
  Accuracy = c(conf_matrix_linear$overall["Accuracy"], conf_matrix_radial$overall["Accuracy"]),
  Kappa = c(conf_matrix_linear$overall["Kappa"], conf_matrix_radial$overall["Kappa"]),
  Sensitivity = c(conf_matrix_linear$byClass["Sensitivity"], conf_matrix_radial$byClass["Sensitivity"]),
  Specificity = c(conf_matrix_linear$byClass["Specificity"], conf_matrix_radial$byClass["Specificity"]),
  BalancedAccuracy = c(conf_matrix_linear$byClass["Balanced Accuracy"], conf_matrix_radial$byClass["Balanced Accuracy"]),
  AUC = c(auc_linear, auc_radial)
)

library(knitr)
kable(results_table_svm, caption = "SVM Model Performance Summary")
Table: SVM Model Performance Summary

| Model        |  Accuracy |     Kappa | Sensitivity | Specificity | BalancedAccuracy |       AUC |
|:-------------|----------:|----------:|------------:|------------:|-----------------:|----------:|
| SVM (Linear) | 0.8967303 | 0.2481848 |   0.9873221 |   0.1831897 |        0.5852559 | 0.6827837 |
| SVM (Radial) | 0.8991583 | 0.2409143 |   0.9919737 |   0.1681034 |        0.5800386 | 0.6862897 |

In this assignment, I extended the machine learning analysis conducted in the previous assignment by introducing Support Vector Machines (SVM) as a fourth algorithm. This analysis supports applications in customer segmentation and marketing analytics, where accurately classifying potential conversions is essential. The aim was to evaluate the performance of SVM on the same dataset used in Homework 2 and to compare its results against the Decision Tree, Random Forest, and AdaBoost models already tested. The dataset consisted of client demographic and campaign features related to term deposit subscriptions, and the objective remained to accurately classify whether a client would subscribe to the product.

In keeping with the previous methodology, I used the same 70/30 train/test split and preprocessing steps, including converting character columns to factors and scaling the numeric columns. The evaluation metrics also remained the same: accuracy, kappa, sensitivity, specificity, balanced accuracy, and the area under the ROC curve (AUC). Note that caret treats the first factor level, "no", as the positive class, so the sensitivity figures below describe how well each model identifies non-subscribers, while specificity reflects the minority "yes" class.

To begin the SVM analysis, I trained a support vector classifier with a linear kernel on the 70 percent training set and generated predictions on the 30 percent holdout test set. The linear SVM achieved an accuracy of 89.67 percent, a kappa of 0.2482, sensitivity of 98.73 percent, and specificity of 18.32 percent; its balanced accuracy was 58.53 percent and its AUC 0.6828.

I then trained a second SVM with a radial kernel, tuning the cost parameter on a sample of 5,000 training rows to reduce runtime. This model achieved an accuracy of 89.92 percent, a kappa of 0.2409, sensitivity of 99.20 percent, and specificity of 16.81 percent; its balanced accuracy was 58.00 percent, and its AUC improved slightly to 0.6863. Overall, the differences between the two kernels were minor: both models showed high sensitivity and similar accuracy, but both struggled to separate the classes, as indicated by their low AUC and specificity values.
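Given how much both kernels struggled with the minority class, one natural follow-up (not run here) would be to penalize "yes" misclassifications more heavily through e1071's class.weights argument. The sketch below is illustrative only; the 8:1 weight roughly mirrors the class imbalance in this dataset and is untuned:

# Hypothetical cost-sensitive re-fit; the weights are an assumption, not tuned
svm_weighted <- svm(y ~ ., data = train_data, kernel = "radial",
                    class.weights = c(no = 1, yes = 8), probability = TRUE)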

When compared directly to the other algorithms from Homework 2, neither SVM model outperformed the ensemble methods. The Decision Tree had comparable accuracy and a higher AUC of 0.7077, Random Forest achieved better balance with an AUC of 0.7867, and AdaBoost remained the top performer with an AUC of 0.8082 and a balanced accuracy of 61.75 percent. Both SVMs matched the Decision Tree in most categories but were less balanced overall, particularly in identifying the minority class. These findings suggest that while SVMs offer consistent results, they are less effective than the alternatives at handling the class imbalance present in this dataset.
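To put the figures quoted above side by side, a small comparison frame can be assembled; the non-SVM values are transcribed from the Homework 2 write-up rather than recomputed here:

# AUC across all five models; Homework 2 values hand-copied from that report
all_models_auc <- data.frame(
  Model = c("SVM (Linear)", "SVM (Radial)", "Decision Tree", "Random Forest", "AdaBoost"),
  AUC   = c(0.6828, 0.6863, 0.7077, 0.7867, 0.8082)
)
kable(all_models_auc, caption = "AUC Comparison Across All Models")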

These observations are reflected in existing literature. Zriqat, Altamimi, and Azzeh (2016) compared multiple classification models for heart disease prediction and found that Decision Trees significantly outperformed SVMs, achieving 99 percent accuracy compared to 76 percent for SVMs. In contrast, Guhathakurata, Kundu, Chakraborty, and Banerjee (2021) demonstrated that SVMs achieved superior predictive performance over Decision Trees in classifying COVID-19 cases, particularly in terms of sensitivity and AUC. A similar outcome was found in the domain of satellite image classification, where Shafri and Ramle (2009) reported that SVMs outperformed Decision Trees due to their ability to handle high-dimensional, nonlinear feature spaces. These studies suggest that while SVMs can be powerful, especially in domains with structured complexity or imbalanced classes, their performance varies depending on context and dataset structure.

In conclusion, the addition of SVM to the experiment provided a useful benchmark and confirmed the strength of the previously selected models. AdaBoost remains the recommended model for deployment due to its superior AUC, balanced accuracy, and generalization ability. While SVM delivered solid results and may be useful in other domains, it is not the most effective choice for this particular classification task. These results, together with the findings from academic literature, reinforce the value of ensemble methods in real-world marketing and customer segmentation applications where class imbalance and predictive precision are critical.

References

Zriqat, I. A., Altamimi, A. M., & Azzeh, M. Y. (2016). A comparative study for predicting heart diseases using data mining classification methods. International Journal of Computer Science and Information Security (IJCSIS), 14(12), 1–5. https://arxiv.org/abs/1704.02799

Guhathakurata, S., Kundu, S., Chakraborty, A., & Banerjee, J. S. (2021). A novel approach to predict COVID-19 using support vector machine. Complexity, 2021, 1–9. https://hindawi.com/journals/complexity/2021/5550344/

Shafri, H. Z. M., & Ramle, F. S. H. (2009). A comparison of support vector machine and decision tree classifications using satellite data of Langkawi Island. Information Technology Journal, 8(1), 64–70. https://scialert.net/fulltext/?doi=itj.2009.64.70