1. Read the following articles:
https://www.hindawi.com/journals/complexity/2021/5550344/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8137961/
2. Search for academic content (at least 3 articles) that compare the use of decision trees vs SVMs in your current area of expertise.
3. Perform an analysis of the dataset used in Homework #2 using the SVM algorithm.
4. Compare the results with the results from previous homework.
5. Answer questions, such as:
Which algorithm is recommended to get more accurate results?
Is it better for classification or regression scenarios?
Do you agree with the recommendations?
Why?
Load Libraries:
Below are the libraries used to complete this assignment:
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(highcharter)
library(dplyr)
library(randomForest)
library(ROCR)
library(pROC)
library(knitr)
library(kableExtra)
library(e1071) # For SVM
The data used in Assignment 2 was the
Bank Marketing Dataset.
A Portuguese bank conducted a marketing campaign (phone calls) with the goal of predicting whether a client will subscribe to a term deposit. The records of these efforts are available as a dataset, which can be downloaded from: https://archive.ics.uci.edu/dataset/222/bank+marketing
set.seed(123)
bank<- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/bank-full.csv",sep=";")
We will follow the same steps as in the previous homework to prepare the data for modeling. In addition to those preprocessing steps, data scaling was added.
# Replace "unknown" with NA
bank <- bank %>% mutate_all(~ifelse(. == "unknown", NA, .))
# Handle missing values
for (col in names(bank)) {
if (is.factor(bank[[col]])) {
mode_val <- names(sort(table(bank[[col]]), decreasing = TRUE))[1]
bank[[col]][is.na(bank[[col]])] <- mode_val
}
}
# Convert categorical variables to factors
bank <- data.frame(lapply(bank, function(x) if(is.character(x)) factor(x) else x))
# Feature Engineering: Creating age_group
bank$age_group <- cut(bank$age, breaks = c(17, 24, 34, 44, 54, 64, 100),
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+"))
# Create a new feature based on call duration
bank <- bank %>% mutate(long_call = if_else(duration > median(duration, na.rm = TRUE), "yes", "no"))
# Feature Engineering: Creating balance_group (income_group)
bank$balance_group <- ifelse(bank$balance <= 500, "low",
ifelse(bank$balance <= 2000, "medium", "high"))
# Convert new features to factors
bank$age_group <- as.factor(bank$age_group)
bank$balance_group <- as.factor(bank$balance_group)
bank$long_call <- as.factor(bank$long_call)
#Remove remaining rows with any NA values to avoid errors
bank <- na.omit(bank)
print(summary(bank))
## age job marital education default
## Min. :18.00 management :1753 divorced: 887 primary :1012 no :7786
## 1st Qu.:32.00 blue-collar:1537 married :4501 secondary:4197 yes: 56
## Median :38.00 technician :1289 single :2454 tertiary :2633
## Mean :40.78 admin. :1057
## 3rd Qu.:47.00 services : 682
## Max. :89.00 retired : 458
## (Other) :1066
## balance housing loan contact day
## Min. :-1884 no :2900 no :6753 cellular :7257 Min. : 1.00
## 1st Qu.: 162 yes:4942 yes:1089 telephone: 585 1st Qu.: 7.00
## Median : 595 Median :14.00
## Mean : 1552 Mean :14.26
## 3rd Qu.: 1734 3rd Qu.:20.00
## Max. :81204 Max. :31.00
##
## month duration campaign pdays
## may :2436 Min. : 5.0 Min. : 1.000 Min. : 1.0
## nov :1093 1st Qu.: 113.0 1st Qu.: 1.000 1st Qu.:133.0
## apr :1075 Median : 194.0 Median : 2.000 Median :195.0
## feb : 881 Mean : 261.3 Mean : 2.064 Mean :223.3
## aug : 493 3rd Qu.: 324.0 3rd Qu.: 2.000 3rd Qu.:326.0
## jan : 472 Max. :2219.0 Max. :16.000 Max. :871.0
## (Other):1392
## previous poutcome y age_group long_call
## Min. : 1.000 failure:4679 no :6056 18-24: 157 no :3652
## 1st Qu.: 1.000 other :1750 yes:1786 25-34:2602 yes:4190
## Median : 2.000 success:1413 35-44:2592
## Mean : 3.184 45-54:1456
## 3rd Qu.: 4.000 55-64: 769
## Max. :275.000 65+ : 266
##
## balance_group
## high :1725
## low :3584
## medium:2533
##
##
##
##
head(bank,10)
## age job marital education default balance housing loan contact
## 24061 33 admin. married tertiary no 882 no no telephone
## 24063 42 admin. single secondary no -247 yes yes telephone
## 24065 33 services married secondary no 3444 yes no telephone
## 24073 36 management married tertiary no 2415 yes no telephone
## 24078 36 management married tertiary no 0 yes no telephone
## 24087 44 blue-collar married secondary no 1324 yes no telephone
## 24123 26 technician single tertiary no 172 no yes telephone
## 24128 51 admin. single secondary no 3132 no no telephone
## 24152 33 unemployed divorced secondary no 1005 yes no telephone
## 24166 30 admin. married secondary no 873 yes no telephone
## day month duration campaign pdays previous poutcome y age_group
## 24061 21 oct 39 1 151 3 failure no 25-34
## 24063 21 oct 519 1 166 1 other yes 35-44
## 24065 21 oct 144 1 91 4 failure yes 25-34
## 24073 22 oct 73 1 86 4 other no 35-44
## 24078 23 oct 140 1 143 3 failure yes 35-44
## 24087 25 oct 119 1 89 2 other no 35-44
## 24123 4 nov 21 1 140 4 other no 25-34
## 24128 5 nov 449 1 176 1 failure no 45-54
## 24152 10 nov 175 1 174 2 failure no 25-34
## 24166 12 nov 119 1 167 3 success no 25-34
## long_call balance_group
## 24061 no medium
## 24063 yes low
## 24065 no high
## 24073 no high
## 24078 no low
## 24087 no medium
## 24123 no low
## 24128 yes high
## 24152 no medium
## 24166 no medium
# Split the data (70% training, 30% testing)
trainIndex <- createDataPartition(bank$y, p = 0.7, list = FALSE)
trainData <- bank[trainIndex, ]
testData <- bank[-trainIndex, ]
# Check the distribution of target variable in both sets
prop.table(table(trainData$y))
##
## no yes
## 0.7721726 0.2278274
prop.table(table(testData$y))
##
## no yes
## 0.7724373 0.2275627
# Data Scaling (Standardization)
numeric_cols <- sapply(bank, is.numeric)
preprocess_obj <- preProcess(trainData[, numeric_cols], method = c("center", "scale"))
trainData[, numeric_cols] <- predict(preprocess_obj, trainData[, numeric_cols])
testData[, numeric_cols] <- predict(preprocess_obj, testData[, numeric_cols])
The dataset is loaded, cleaned, and preprocessed to ensure all variables are correctly formatted, and any missing values are handled. Feature engineering steps were included to create additional features that may improve model performance.
We now continue our experiments with the Support Vector Machine implementation.
Hypothesis: Linear kernel with the default cost of 1 will be a better model for making classifications on the data than the algorithms from the previous assignment
The linear kernel SVM serves as a baseline model. It assumes a linear relationship between the features and the target variable (customer subscription). The model should work adequately when the data exhibits linear separability but might perform poorly when the relationships between variables are highly non-linear.
# SVM with Linear Kernel
set.seed(123)
svm_linear <- svm(y ~ ., data=trainData, kernel="linear", probability=TRUE)
summary(svm_linear)
##
## Call:
## svm(formula = y ~ ., data = trainData, kernel = "linear", probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 1919
##
## ( 967 952 )
##
##
## Number of Classes: 2
##
## Levels:
## no yes
svm_linear_pred <- predict(svm_linear, testData)
svm_linear_prob <- predict(svm_linear, testData, probability=TRUE)
svm_linear_cm <- confusionMatrix(svm_linear_pred, testData$y, positive="yes")
svm_linear_roc <- roc(testData$y, as.numeric(attr(svm_linear_prob, "probabilities")[,2]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
svm_linear_cm$overall["Accuracy"]
## Accuracy
## 0.8434709
svm_linear_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1690 242
## yes 126 293
##
## Accuracy : 0.8435
## 95% CI : (0.8281, 0.8579)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5179
##
## Mcnemar's Test P-Value : 2.037e-09
##
## Sensitivity : 0.5477
## Specificity : 0.9306
## Pos Pred Value : 0.6993
## Neg Pred Value : 0.8747
## Prevalence : 0.2276
## Detection Rate : 0.1246
## Detection Prevalence : 0.1782
## Balanced Accuracy : 0.7391
##
## 'Positive' Class : yes
##
The default linear SVM achieved an accuracy of approximately 84.35% and significant predictive power (p-value < 2.2e-16), but with better performance on the negative class than the positive. This suggests that there is some degree of linear separability in the data, and this performance provides a benchmark against which to compare the other models. Given the relatively high accuracy, it is likely that a roughly linear relationship exists between certain features and the target variable.
The default SVM with linear kernel performed a little better than the default decision tree, which had an accuracy of 83.79%. Comparing the confusion matrices, the default linear-kernel SVM has a slightly higher count of true negatives (1690, with 242 false negatives) than the default decision tree model (1669 true negatives, 234 false negatives).
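As a rough check on which features drive this linear decision boundary, the hyperplane weights can be recovered from the support vectors. This is a small illustrative sketch (not part of the original analysis); the weights refer to the internally scaled model-matrix columns, so they indicate relative influence rather than raw effect sizes.
# Sketch: recover the linear hyperplane weights as t(coefs) %*% SV
w <- t(svm_linear$coefs) %*% svm_linear$SV
# The largest-magnitude weights point to the most influential (scaled) features
head(sort(abs(w[1, ]), decreasing = TRUE), 10)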
Hypothesis: Tuning the cost parameter will improve linear SVM performance
To enhance the linear SVM's performance, we adjust the cost parameter (C), which balances the trade-off between low training error and low generalization error. A high cost parameter reduces training errors at the risk of overfitting, while a low cost parameter emphasizes a larger margin, accepting more training errors in exchange for better generalization.
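For reference, C is the penalty weight in the standard soft-margin objective that the SVM solves (the textbook formulation, shown here only to make the trade-off explicit):

$$\min_{w,\,b,\,\xi}\ \tfrac{1}{2}\lVert w \rVert^2 + C \sum_{i=1}^{n} \xi_i \quad \text{subject to}\quad y_i\,(w^\top x_i + b) \ge 1 - \xi_i,\ \ \xi_i \ge 0,$$

so a larger C penalizes margin violations (the slack terms $\xi_i$) more heavily and shrinks the margin, while a smaller C tolerates more violations in exchange for a wider margin.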
# Define the grid of cost values to test
tune_grid <- expand.grid(cost = c(0.001, 0.01, 0.1, 1, 5, 10))
# Perform grid search with cross-validation
set.seed(123)
tune_control <- tune.control(cross = 5) # 5-fold cross-validation
svm_tune <- tune.svm(y ~ ., data = trainData, kernel = "linear",
cost = tune_grid$cost,
tunecontrol = tune_control)
# Print the best model
print(svm_tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 5-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.1586254
# Get the best cost value
best_cost <- svm_tune$best.parameters$cost
Now that we have identified the best cost value (5), we can train the SVM model using this optimized parameter. In theory this should give us a model that generalizes better to unseen data than one trained with a default or arbitrary cost value.
# Train the SVM model with the best cost
set.seed(123)
svm_linear_tuned <- svm(y ~ ., data=trainData, kernel="linear", cost=best_cost, probability=TRUE)
# Make predictions on the test data
svm_linear_pred_tuned <- predict(svm_linear_tuned, testData)
svm_linear_prob_tuned <- predict(svm_linear_tuned, testData, probability=TRUE)
# Evaluate the tuned model
svm_linear_cm_tuned <- confusionMatrix(svm_linear_pred_tuned, testData$y, positive="yes")
svm_linear_roc_tuned <- roc(testData$y, as.numeric(attr(svm_linear_prob_tuned, "probabilities")[,2]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Print the results
print(svm_linear_cm_tuned$overall["Accuracy"])
## Accuracy
## 0.8421948
print(svm_linear_cm_tuned)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1688 243
## yes 128 292
##
## Accuracy : 0.8422
## 95% CI : (0.8268, 0.8567)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5143
##
## Mcnemar's Test P-Value : 3.247e-09
##
## Sensitivity : 0.5458
## Specificity : 0.9295
## Pos Pred Value : 0.6952
## Neg Pred Value : 0.8742
## Prevalence : 0.2276
## Detection Rate : 0.1242
## Detection Prevalence : 0.1786
## Balanced Accuracy : 0.7377
##
## 'Positive' Class : yes
##
The tuned model did not perform better, which may suggest that the default cost value was already close to optimal, or that further tuning with a different range of cost values, or other kernels, is needed. We therefore continue with the next experiment to see whether switching from a linear kernel to a Radial Basis Function (RBF) kernel improves performance. The RBF kernel is a popular choice for non-linear data.
Hypothesis: Radial kernel will capture non-linear relationships better
The radial kernel SVM should outperform the linear kernel when the feature-target relationships show non-linear patterns. The radial kernel implicitly maps the data into a high-dimensional space, which simplifies the task of finding a separating hyperplane. With default settings, however, the radial-kernel SVM may fail to capture specific non-linear patterns in the data because it is not optimally configured.
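For reference, the radial (RBF) kernel computes similarity as

$$K(x_i, x_j) = \exp\!\left(-\gamma \,\lVert x_i - x_j \rVert^2\right),$$

where gamma controls how quickly similarity decays with distance; e1071's default gamma is 1 divided by the number of model-matrix columns, which is the value used in the default fit below.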
# SVM with Radial Kernel
svm_radial <- svm(y ~ ., data=trainData, kernel="radial", probability=TRUE)
summary(svm_radial)
##
## Call:
## svm(formula = y ~ ., data = trainData, kernel = "radial", probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 2090
##
## ( 1062 1028 )
##
##
## Number of Classes: 2
##
## Levels:
## no yes
# Make predictions on the test data
svm_radial_pred <- predict(svm_radial, testData)
svm_radial_prob <- predict(svm_radial, testData, probability=TRUE)
# Create confusion matrix
svm_radial_cm <- confusionMatrix(svm_radial_pred, testData$y, positive="yes")
# Calculate accuracy
accuracy <- sum(svm_radial_cm$table[1, 1], svm_radial_cm$table[2, 2]) / sum(svm_radial_cm$table)
cat("Accuracy of the SVM model with radial kernel:", accuracy, "\n")
## Accuracy of the SVM model with radial kernel: 0.8434709
# ROC analysis
svm_radial_roc <- roc(testData$y, as.numeric(attr(svm_radial_prob, "probabilities")[, 2]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
The default radial SVM achieved an accuracy of approximately 84.35%, essentially identical to the default linear SVM. This may indicate that the default parameters for the radial kernel (specifically gamma) are not well suited to this dataset, or that the data does not have strong non-linear relationships that a radial kernel can exploit without tuning.
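One way to test the gamma hypothesis directly in e1071, before moving to the caret grid search below, would be a small grid over gamma and cost with tune.svm. This is an illustrative sketch only; the candidate values are assumptions and this code was not run as part of the analysis.
# Sketch (not run): explore gamma and cost for the radial kernel in e1071
set.seed(123)
svm_radial_grid <- tune.svm(y ~ ., data = trainData, kernel = "radial",
                            gamma = c(0.001, 0.01, 0.1, 1),  # kernel width candidates
                            cost  = c(0.1, 1, 5, 10),        # regularization candidates
                            tunecontrol = tune.control(cross = 5))
summary(svm_radial_grid)  # cross-validated error for each gamma/cost combination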
Hypothesis: Tuning both cost and gamma will improve radial SVM performance
Tuning the parameters of the radial-kernel SVM should significantly improve its performance. By optimizing these parameters, the model can better capture complex non-linear relationships in the data, leading to higher accuracy and better generalization.
# Define the parameter grid for tuning
set.seed(123)
tune_grid <- expand.grid(
C = c(0.001, 0.01, 0.1, 1, 5, 10),
sigma = c(0.001, 0.01, 0.1, 1, 5, 10)
)
The caret package requires the tuning-grid column for the RBF kernel parameter to be named 'sigma' (the name used by the underlying kernlab implementation) rather than 'gamma'; otherwise it raises "Error: The tuning parameter grid should have columns sigma, C" at the tuning step.
# Set up cross-validation
fitControl <- trainControl(
method = "cv",
number = 5, # Number of folds
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE
)
# Tune the SVM model
svm_tune <- train(
y ~ .,
data = trainData,
method = "svmRadial",
trControl = fitControl,
tuneGrid = tune_grid,
metric = "ROC"
)
## maximum number of iterations reached (warning repeated by the underlying solver for many fold/parameter combinations; remaining repetitions omitted)
# Print the best tuning parameters
print(svm_tune$bestTune)
## sigma C
## 20 0.01 1
# Make predictions using the best model
svm_tuned_pred <- predict(svm_tune, testData)
svm_tuned_prob <- predict(svm_tune, testData, type = "prob")
# Evaluate the tuned model
svm_tuned_cm <- confusionMatrix(svm_tuned_pred, testData$y, positive = "yes")
print(svm_tuned_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1706 256
## yes 110 279
##
## Accuracy : 0.8443
## 95% CI : (0.829, 0.8588)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.51
##
## Mcnemar's Test P-Value : 3.475e-14
##
## Sensitivity : 0.5215
## Specificity : 0.9394
## Pos Pred Value : 0.7172
## Neg Pred Value : 0.8695
## Prevalence : 0.2276
## Detection Rate : 0.1187
## Detection Prevalence : 0.1655
## Balanced Accuracy : 0.7305
##
## 'Positive' Class : yes
##
Unfortunately, the tuned radial-kernel SVM did not show much improvement. The default and tuned radial-kernel SVMs show similar accuracy, 0.843 and 0.844 respectively.
plot_multiple_roc <- function(list_of_rocs, model_names) {
plot(list_of_rocs[[1]], col = 1, main = "ROC Curves Comparison")
for(i in 2:length(list_of_rocs)) {
lines(list_of_rocs[[i]], col = i)
}
legend("bottomright", legend = model_names, col = 1:length(list_of_rocs), lwd = 2)
}
# Store ROC objects
roc_list <- list(
svm_linear_roc,
svm_linear_roc_tuned,
svm_radial_roc,
roc(testData$y, svm_tuned_prob[,"yes"])
)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC curves
plot_multiple_roc(roc_list,
c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"))
performance_metrics <- data.frame(
Model = c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"),
Accuracy = c(svm_linear_cm$overall['Accuracy'],
svm_linear_cm_tuned$overall['Accuracy'],
svm_radial_cm$overall['Accuracy'],
svm_tuned_cm$overall['Accuracy']),
Precision = c(svm_linear_cm$byClass['Pos Pred Value'],
svm_linear_cm_tuned$byClass['Pos Pred Value'],
svm_radial_cm$byClass['Pos Pred Value'],
svm_tuned_cm$byClass['Pos Pred Value']),
Recall = c(svm_linear_cm$byClass['Sensitivity'],
svm_linear_cm_tuned$byClass['Sensitivity'],
svm_radial_cm$byClass['Sensitivity'],
svm_tuned_cm$byClass['Sensitivity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Visualize performance metrics
performance_long <- gather(performance_metrics,
Metric, Value, -Model)
ggplot(performance_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Performance Comparison of SVM Models",
y = "Score", x = "Model") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Performance Metrics Table Creation
performance_metrics <- data.frame(
Model = c("SVM Linear", "SVM Tuned Linear", "SVM Radial", "SVM Tuned Radial"),
Accuracy = c(svm_linear_cm$overall['Accuracy'], svm_linear_cm_tuned$overall['Accuracy'], svm_radial_cm$overall['Accuracy'], svm_tuned_cm$overall['Accuracy']),
Sensitivity = c(svm_linear_cm$byClass['Sensitivity'], svm_linear_cm_tuned$byClass['Sensitivity'], svm_radial_cm$byClass['Sensitivity'], svm_tuned_cm$byClass['Sensitivity']),
Specificity = c(svm_linear_cm$byClass['Specificity'], svm_linear_cm_tuned$byClass['Specificity'], svm_radial_cm$byClass['Specificity'], svm_tuned_cm$byClass['Specificity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Display Performance Metrics Table
kable(performance_metrics, format = "html") %>%
kableExtra::kable_styling(full_width = F)
| Model | Accuracy | Sensitivity | Specificity | F1_Score |
|---|---|---|---|---|
| SVM Linear | 0.8434709 | 0.5476636 | 0.9306167 | 0.6142558 |
| SVM Tuned Linear | 0.8421948 | 0.5457944 | 0.9295154 | 0.6115183 |
| SVM Radial | 0.8434709 | 0.5121495 | 0.9410793 | 0.5982533 |
| SVM Tuned Radial | 0.8443216 | 0.5214953 | 0.9394273 | 0.6038961 |
The accuracy values of all four models fall between 0.842 and 0.845, showing that they classify nearly identical percentages of instances correctly. Sensitivity varies more across the models than the other performance metrics: the linear SVMs detect positive cases best (about 0.546-0.548), while the radial SVMs give up some sensitivity (0.512-0.522) in exchange for slightly higher specificity and precision. Sensitivity measures how well the model detects positive cases.
The models maintain consistent specificity values ranging from 0.929 to 0.941, which demonstrates their strong ability to identify negative cases correctly. Selecting the appropriate model requires weighing sensitivity against specificity for the planned use: the default linear SVM is preferable when identifying positive cases is the priority, while the tuned radial SVM offers the best accuracy and precision at the cost of some sensitivity. The small differences between models indicate that all four perform at a similar level of effectiveness.
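Because accuracy alone hides the sensitivity/specificity trade-off, the ROC objects built above can also be summarized by AUC. A small sketch, assuming roc_list from the ROC plot is still in the session (AUC values are not reported elsewhere in this document):
# Sketch: AUC for each model from the pROC objects created earlier
auc_values <- sapply(roc_list, pROC::auc)
data.frame(Model = c("Linear SVM", "Tuned Linear SVM", "Radial SVM", "Tuned Radial SVM"),
           AUC = round(as.numeric(auc_values), 4))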
According to the previous experiments, the SVM models achieved lower accuracy than the Random Forest models, which reached about 85%, and the SVM models show variable sensitivity and specificity values. The tuned radial SVM demonstrates strong negative-case identification through its high specificity of about 0.94. Random Forest models outperform SVM models in both accuracy and sensitivity, while the Decision Tree models achieve the lowest accuracy.
Random Forest models demonstrate superior accuracy and sensitivity compared to SVM models in this analysis, but the choice of the most appropriate model needs to be aligned with the specific demands and priorities of each application. For applications that require high specificity, the tuned radial SVM presents a viable solution. The Decision Tree models show inferior performance compared to the other models.
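If the fitted objects from the previous homework were still available, this cross-assignment comparison could be made explicit in one table. The following is a sketch under that assumption; dt_cm and rf_cm are hypothetical names for the decision tree and random forest confusionMatrix objects from Homework #2 and are not defined in this report.
# Sketch (assumes dt_cm and rf_cm exist from the previous assignment)
comparison <- data.frame(
  Model       = c("Decision Tree", "Random Forest", "SVM Tuned Radial"),
  Accuracy    = c(dt_cm$overall["Accuracy"],    rf_cm$overall["Accuracy"],    svm_tuned_cm$overall["Accuracy"]),
  Sensitivity = c(dt_cm$byClass["Sensitivity"], rf_cm$byClass["Sensitivity"], svm_tuned_cm$byClass["Sensitivity"]),
  Specificity = c(dt_cm$byClass["Specificity"], rf_cm$byClass["Specificity"], svm_tuned_cm$byClass["Specificity"])
)
kable(comparison, digits = 4)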
Demonstrate that the provided articles were read by drawing insights, summarizing the articles, or comparing them:
https://www.hindawi.com/journals/complexity/2021/5550344/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8137961/
The two articles present analyses demonstrating how decision-tree ensemble methods can predict Covid-19 infections from laboratory data by handling imbalanced datasets and emphasizing appropriate machine learning techniques and evaluation metrics. The studies demonstrate the effectiveness of ensemble methods for imbalanced datasets and show that age is a critical factor in prediction models. Both articles acknowledge the challenge presented by imbalanced datasets in Covid-19 infection prediction. The first dataset contains 600 patient samples with a class ratio of roughly 1:6.5.
The second dataset includes 5644 patients, where positive cases account for approximately 10%. Class imbalance leads to biased models, necessitating special correction techniques. These methods operate robustly and produce accurate outcomes when applied to unbalanced datasets. Both studies employed the evaluation metrics accuracy, precision, recall, F1-measure, AUC-ROC, and AUPRC, and the results demonstrate that classifiers designed for imbalanced data achieve superior outcomes. The balanced random forest (RUS) outperformed other methods according to AUPRC, while RUSBagging yielded superior AUROC results. Merging age information with laboratory test data enhances predictive accuracy; studies that ignored age as a significant factor failed to achieve high accuracy estimates.
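The imbalance lesson from these articles could be carried over to the bank data, where only about 23% of the retained cases are positive. One simple option, sketched here as an assumption rather than as part of the analysis above, is to weight the minority class more heavily when fitting the SVM.
# Sketch (not run): class-weighted radial SVM to counter the class imbalance
wts <- c(no = 1, yes = sum(trainData$y == "no") / sum(trainData$y == "yes"))
set.seed(123)
svm_weighted <- svm(y ~ ., data = trainData, kernel = "radial",
                    class.weights = wts, probability = TRUE)
confusionMatrix(predict(svm_weighted, testData), testData$y, positive = "yes")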
https://medium.com/@jangdaehan1/svm-versus-decision-trees-a-comparative-analysis-in-supervised-learning-07e6fcc14ecd
This analytical piece reviews Support Vector Machines (SVM) and Decision Trees by examining their methods and benefits while addressing their challenges and practical use cases in supervised learning. Support Vector Machines perform well in high-dimensional data spaces and provide strong resistance to overfitting whereas Decision Trees provide clear interpretability and user-friendly application despite being susceptible to overfitting. The discussion presents performance comparisons along with contextual application significance while emphasizing the vital need for informed algorithm selection in the evolving artificial intelligence domain.
https://www.coursera.org/articles/difference-between-svm-and-decision-tree
This article examines how Support Vector Machines (SVMs) and decision trees function as machine learning models for data classification and describes their respective mechanisms while assessing their benefits and challenges and practical applications. Support Vector Machines function well in spaces with many dimensions and offer versatility through various kernel functions whereas decision trees provide easy comprehension alongside flexibility with diverse data types and can be applied to classification and regression problems. The selection process between SVMs and decision trees should be based on the specific requirements of a project and its intended application.
https://scialert.net/fulltext/?doi=itj.2009.64.70
The study compares how accurately Support Vector Machine (SVM) and Decision Tree (DT) methods classify satellite imagery data from Langkawi Island. In this image classification task, the SVM with a radial basis function kernel demonstrated superior performance, with an overall accuracy of 76.0004%, compared to the Decision Tree method, which achieved 68.7846%.
Researchers implemented Decision Tree (DT) and Support Vector Machine (SVM) algorithms to analyze SPOT 5 satellite imagery. The development of DT rules was carried out manually through analysis of Normalized Difference Vegetation Index (NDVI) and Brightness Value (BV) variables. The SVM method was implemented automatically using four kernel types: linear, polynomial, radial basis function, and sigmoid.
This assignment focused on using the Support Vector Machine (SVM) algorithm to examine the dataset from Homework #2 and then compared the results with previous assignments.
The Bank Marketing Dataset serves as the foundation for the data analysis in Assignment 2. It comes from a Portuguese bank campaign that used phone calls to predict whether clients would subscribe to a term deposit. The dataset, which contains the records of these marketing activities, can be downloaded from https://archive.ics.uci.edu/dataset/222/bank+marketing.
At the start of the project, the dataset was prepared for modeling. The data preparation process consisted of three steps: loading the data, cleaning it to remove inconsistencies and format variables properly, and pre-processing to handle missing values. During this phase, feature engineering created new features that might improve model performance. In this assignment, data scaling was applied after splitting the dataset in order to avoid data leakage.
Experiment 8: SVM with Linear Kernel
Hypothesis: Linear kernel with the default cost of 1 will be a better model for making classifications on the data than the algorithms from the previous assignment
The linear kernel SVM serves as a baseline model. It assumes a linear relationship between the features and the target variable (customer subscription). The model should work adequately when the data exhibits linear separability but might perform poorly when the relationships between variables are highly non-linear.
The svm function from the e1071 package
enabled the construction of a linear kernel SVM model. This experiment
tested if a linear kernel with its default cost parameter would produce
superior classification outcomes in comparison to algorithms from the
prior assignment.
The default linear SVM achieved an accuracy of approximately 84.35% and significant predictive power (p-value < 2.2e-16), but with better performance on the negative class than the positive. This suggests that there is some degree of linear separability in the data, and this performance provides a benchmark against which to compare the other models. Given the relatively high accuracy, it is likely that a roughly linear relationship exists between certain features and the target variable.
The default SVM with linear kernel performed a little better than the default decision tree, which had an accuracy of 83.79%. Comparing the confusion matrices, the default linear-kernel SVM has slightly higher counts of correct predictions than the default decision tree model.
Experiment 9: Tuned Linear Kernel SVM
Hypothesis: Tuning the cost parameter will improve linear SVM performance
To enhance the linear SVM's performance, we adjust the cost parameter (C), which balances the trade-off between low training error and low generalization error. A high cost parameter reduces training errors at the risk of overfitting, while a low cost parameter emphasizes a larger margin, accepting more training errors in exchange for better generalization.
In the subsequent experiment aimed at enhancing the linear SVM model,
a grid search accompanied by cross-validation was conducted utilizing
tune.svm. The procedure involved defining a range of cost
values and implementing 5-fold cross-validation to identify the optimal
cost parameter.
The selected best cost value of 5 was then employed to train a refined SVM model. This model achieved an accuracy of 0.8422, which is almost the same as the tuned Decision Tree accuracy of 84.17% from the previous assignment.
The tuned SVM with linear kernel did not improve on the default model, indicating that the initial cost value may have been near optimal, or that additional tuning with a broader range of cost values or alternative kernels could be warranted. Therefore, I proceeded to the next experiment to assess whether transitioning from a linear kernel to a Radial Basis Function (RBF) kernel would enhance performance.
Experiment 10: Radial Kernel SVM (Default)
Hypothesis: Radial kernel will capture non-linear relationships better
The assumption for this experiment was that the radial kernel SVM should outperform the linear kernel when the feature-target relationships show non-linear patterns. The radial kernel implicitly maps the data into a high-dimensional space, which simplifies the task of finding a separating hyperplane. With default settings, however, the radial-kernel SVM may fail to capture specific non-linear patterns in the data because it is not optimally configured.
Ultimately, an SVM model utilizing a radial kernel was constructed, followed by a similar evaluation process.
The default radial SVM achieved an accuracy of approximately 84.35%, essentially identical to the default linear SVM (84.35%). This may indicate that the default parameters for the radial kernel (specifically gamma) are not well suited to this dataset, or that the data does not have strong non-linear relationships that a radial kernel can exploit without tuning.
Experiment 11: Tuned Radial Kernel SVM
Hypothesis: Tuning both cost and gamma will improve radial SVM performance
The expectation for this experiment was that tuning the parameters of the radial-kernel SVM would significantly improve its performance: by optimizing these parameters the model can better capture complex non-linear relationships in the data, leading to higher accuracy and better generalization. Unfortunately, the tuned radial-kernel SVM did not show much improvement. The default and tuned radial-kernel SVMs show similar accuracy, 0.843 and 0.844 respectively.
Overall, all four models show similar accuracy rates, from 0.842 to 0.845, indicating that they classify about the same percentage of cases correctly. The models differ more in sensitivity than in the other metrics: the linear SVMs detect positive cases best (about 0.546-0.548), while the radial SVMs give up some sensitivity (0.512-0.522) in exchange for slightly higher specificity and precision. The sensitivity metric measures how well the model detects positive cases.
The models maintain consistent specificity levels between 0.929 and 0.941, which demonstrates their efficiency at accurately identifying negative cases. The choice of model depends on how important sensitivity is compared to specificity in the specific application: when catching positive cases is essential, the default linear SVM is the stronger choice, whereas the tuned radial SVM offers the best accuracy and precision at the cost of some sensitivity. The performance variations between the four models remain minor, which demonstrates that they all deliver equivalent effectiveness.
Compare with the previous models
The Random Forest model with default settings reached the highest accuracy of 0.8545300 along with high sensitivity of 0.9207040. However, it has a much lower specificity of 0.6299065 compared to the SVM Tuned Radial model, which achieves a specificity of 0.9394.
Despite achieving superior specificity, the SVM Tuned Radial model exhibits lower accuracy and sensitivity than the Random Forest model. The selection of the best model relies on the specific requirements of the application: the SVM Tuned Radial model is the top choice where high specificity matters most, while the Random Forest model is the superior option when high accuracy and sensitivity are required. To reach a definitive conclusion, an evaluation must weigh the consequences of false positives against those of false negatives.
Additional context and insights from the literature underline the critical role of kernel selection and hyperparameter tuning while showcasing the trade-offs that exist among various modeling approaches. The best model choice emerges from an analysis of the dataset features together with the marketing campaign objectives.
Model choice needs to align with the organization's business objectives. The bank should evaluate multiple models and adjust decision thresholds to enhance recall so that it discovers more potential subscribers. Organizations may prioritize Decision Trees for the transparency of their decisions even though these models can exhibit lower accuracy than more complex models, while Random Forests gain some interpretability through feature importance rankings.
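To illustrate the threshold idea, the predicted probabilities from the tuned radial SVM could be cut at a lower value than the default 0.5 to favour recall. This is a minimal sketch, assuming svm_tuned_prob from the tuning step is still in the session; the 0.3 cutoff is illustrative only.
# Sketch: lower the decision threshold to catch more potential subscribers
yes_prob    <- svm_tuned_prob[, "yes"]
pred_recall <- factor(ifelse(yes_prob > 0.3, "yes", "no"), levels = c("no", "yes"))
confusionMatrix(pred_recall, testData$y, positive = "yes")  # expect higher recall, lower precision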
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(highcharter)
library(dplyr)
library(randomForest)
library(ROCR)
library(pROC)
library(knitr)
library(kableExtra)
library(e1071) # For SVM
set.seed(123)
bank<- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/bank-full.csv",sep=";")
# Replace "unknown" with NA
bank <- bank %>% mutate_all(~ifelse(. == "unknown", NA, .))
# Handle missing values
for (col in names(bank)) {
if (is.factor(bank[[col]])) {
mode_val <- names(sort(table(bank[[col]]), decreasing = TRUE))[1]
bank[[col]][is.na(bank[[col]])] <- mode_val
}
}
# Convert categorical variables to factors
bank <- data.frame(lapply(bank, function(x) if(is.character(x)) factor(x) else x))
# Feature Engineering: Creating age_group
bank$age_group <- cut(bank$age, breaks = c(17, 24, 34, 44, 54, 64, 100),
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+"))
# Create a new feature based on call duration
bank <- bank %>% mutate(long_call = if_else(duration > median(duration, na.rm = TRUE), "yes", "no"))
# Feature Engineering: Creating balance_group (income_group)
bank$balance_group <- ifelse(bank$balance <= 500, "low",
ifelse(bank$balance <= 2000, "medium", "high"))
# Convert new features to factors
bank$age_group <- as.factor(bank$age_group)
bank$balance_group <- as.factor(bank$balance_group)
bank$long_call <- as.factor(bank$long_call)
#Remove remaining rows with any NA values to avoid errors
bank <- na.omit(bank)
print(summary(bank))
head(bank,10)
# Split the data (70% training, 30% testing)
trainIndex <- createDataPartition(bank$y, p = 0.7, list = FALSE)
trainData <- bank[trainIndex, ]
testData <- bank[-trainIndex, ]
# Check the distribution of target variable in both sets
prop.table(table(trainData$y))
prop.table(table(testData$y))
# Data Scaling (Standardization)
numeric_cols <- sapply(bank, is.numeric)
preprocess_obj <- preProcess(trainData[, numeric_cols], method = c("center", "scale"))
trainData[, numeric_cols] <- predict(preprocess_obj, trainData[, numeric_cols])
testData[, numeric_cols] <- predict(preprocess_obj, testData[, numeric_cols])
# SVM with Linear Kernel
set.seed(123)
svm_linear <- svm(y ~ ., data=trainData, kernel="linear", probability=TRUE)
summary(svm_linear)
svm_linear_pred <- predict(svm_linear, testData)
svm_linear_prob <- predict(svm_linear, testData, probability=TRUE)
svm_linear_cm <- confusionMatrix(svm_linear_pred, testData$y, positive="yes")
svm_linear_roc <- roc(testData$y, as.numeric(attr(svm_linear_prob, "probabilities")[,2]))
svm_linear_cm$overall["Accuracy"]
svm_linear_cm
# Define the grid of cost values to test
tune_grid <- expand.grid(cost = c(0.001, 0.01, 0.1, 1, 5, 10))
# Perform grid search with cross-validation
set.seed(123)
tune_control <- tune.control(cross = 5) # 5-fold cross-validation
svm_tune <- tune.svm(y ~ ., data = trainData, kernel = "linear",
cost = tune_grid$cost,
tunecontrol = tune_control)
# Print the best model
print(svm_tune)
# Get the best cost value
best_cost <- svm_tune$best.parameters$cost
# Train the SVM model with the best cost
set.seed(123)
svm_linear_tuned <- svm(y ~ ., data=trainData, kernel="linear", cost=best_cost, probability=TRUE)
# Make predictions on the test data
svm_linear_pred_tuned <- predict(svm_linear_tuned, testData)
svm_linear_prob_tuned <- predict(svm_linear_tuned, testData, probability=TRUE)
# Evaluate the tuned model
svm_linear_cm_tuned <- confusionMatrix(svm_linear_pred_tuned, testData$y, positive="yes")
svm_linear_roc_tuned <- roc(testData$y, as.numeric(attr(svm_linear_prob_tuned, "probabilities")[,2]))
# Print the results
print(svm_linear_cm_tuned$overall["Accuracy"])
print(svm_linear_cm_tuned)
# SVM with Radial Kernel
svm_radial <- svm(y ~ ., data=trainData, kernel="radial", probability=TRUE)
summary(svm_radial)
# Make predictions on the test data
svm_radial_pred <- predict(svm_radial, testData)
svm_radial_prob <- predict(svm_radial, testData, probability=TRUE)
# Create confusion matrix
svm_radial_cm <- confusionMatrix(svm_radial_pred, testData$y, positive="yes")
# Calculate accuracy
accuracy <- sum(svm_radial_cm$table[1, 1], svm_radial_cm$table[2, 2]) / sum(svm_radial_cm$table)
cat("Accuracy of the SVM model with radial kernel:", accuracy, "\n")
# ROC analysis
svm_radial_roc <- roc(testData$y, as.numeric(attr(svm_radial_prob, "probabilities")[, 2]))
# Define the parameter grid for tuning
set.seed(123)
tune_grid <- expand.grid(
C = c(0.001, 0.01, 0.1, 1, 5, 10),
sigma = c(0.001, 0.01, 0.1, 1, 5, 10)
)
# Set up cross-validation
fitControl <- trainControl(
method = "cv",
number = 5, # Number of folds
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE
)
# Tune the SVM model
svm_tune <- train(
y ~ .,
data = trainData,
method = "svmRadial",
trControl = fitControl,
tuneGrid = tune_grid,
metric = "ROC"
)
# Print the best tuning parameters
print(svm_tune$bestTune)
# Make predictions using the best model
svm_tuned_pred <- predict(svm_tune, testData)
svm_tuned_prob <- predict(svm_tune, testData, type = "prob")
# Evaluate the tuned model
svm_tuned_cm <- confusionMatrix(svm_tuned_pred, testData$y, positive = "yes")
print(svm_tuned_cm)
plot_multiple_roc <- function(list_of_rocs, model_names) {
plot(list_of_rocs[[1]], col = 1, main = "ROC Curves Comparison")
for(i in 2:length(list_of_rocs)) {
lines(list_of_rocs[[i]], col = i)
}
legend("bottomright", legend = model_names, col = 1:length(list_of_rocs), lwd = 2)
}
# Store ROC objects
roc_list <- list(
svm_linear_roc,
svm_linear_roc_tuned,
svm_radial_roc,
roc(testData$y, svm_tuned_prob[,"yes"])
)
# Plot ROC curves
plot_multiple_roc(roc_list,
c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"))
performance_metrics <- data.frame(
Model = c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"),
Accuracy = c(svm_linear_cm$overall['Accuracy'],
svm_linear_cm_tuned$overall['Accuracy'],
svm_radial_cm$overall['Accuracy'],
svm_tuned_cm$overall['Accuracy']),
Precision = c(svm_linear_cm$byClass['Pos Pred Value'],
svm_linear_cm_tuned$byClass['Pos Pred Value'],
svm_radial_cm$byClass['Pos Pred Value'],
svm_tuned_cm$byClass['Pos Pred Value']),
Recall = c(svm_linear_cm$byClass['Sensitivity'],
svm_linear_cm_tuned$byClass['Sensitivity'],
svm_radial_cm$byClass['Sensitivity'],
svm_tuned_cm$byClass['Sensitivity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Visualize performance metrics
performance_long <- gather(performance_metrics,
Metric, Value, -Model)
ggplot(performance_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Performance Comparison of SVM Models",
y = "Score", x = "Model") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Performance Metrics Table Creation
performance_metrics <- data.frame(
Model = c("SVM Linear", "SVM Tuned Linear", "SVM Radial", "SVM Tuned Radial"),
Accuracy = c(svm_linear_cm$overall['Accuracy'], svm_linear_cm_tuned$overall['Accuracy'], svm_radial_cm$overall['Accuracy'], svm_tuned_cm$overall['Accuracy']),
Sensitivity = c(svm_linear_cm$byClass['Sensitivity'], svm_linear_cm_tuned$byClass['Sensitivity'], svm_radial_cm$byClass['Sensitivity'], svm_tuned_cm$byClass['Sensitivity']),
Specificity = c(svm_linear_cm$byClass['Specificity'], svm_linear_cm_tuned$byClass['Specificity'], svm_radial_cm$byClass['Specificity'], svm_tuned_cm$byClass['Specificity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Display Performance Metrics Table
kable(performance_metrics, format = "html") %>%
kableExtra::kable_styling(full_width = F)