# Load packages
suppressWarnings(suppressMessages({
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(GGally)
  library(ggmosaic)
  library(caret)
  library(e1071)
  library(DMwR2)
  library(doParallel)
  library(foreach)
}))

# Load data
bank_addtl_full <- read.csv("/Users/mollysiebecker/DATA 622/bank-additional-full.csv")

Data Pre-Processing

# Feature selection to improve computational efficiency 
# Eliminate 'pdays' and 'default' (near zero variance)
bank_addtl_full <- bank_addtl_full %>% select(-pdays, -default)
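The near-zero-variance claim is easy to verify: caret's nearZeroVar reports the diagnostics for every column. A minimal check (run it before the select() above removes the two columns):

# Flag near-zero-variance predictors; nzv == TRUE marks the offenders
# (run before the select() above so 'pdays' and 'default' are still present)
nzv <- nearZeroVar(bank_addtl_full, saveMetrics = TRUE)
nzv[nzv$nzv, ]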
# Handling missing data
# Convert 'unknown' to NA
bank_addtl_full[bank_addtl_full == "unknown"] <- NA
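A quick tally of the missingness this recoding introduces, per column (a minimal sketch):

# Count the NAs produced by the 'unknown' recoding
sort(colSums(is.na(bank_addtl_full)), decreasing = TRUE)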

# Mode imputation to fill in missing values: replace each NA with the most
# frequent level of its column
impute_mode <- function(x) {
  x[is.na(x)] <- names(sort(table(x), decreasing = TRUE))[1]
  x
}

bank_addtl_full <- bank_addtl_full %>%
  mutate(across(c(job, marital, education, housing, loan), impute_mode))
# Handling categorical data

# Convert character vectors to factors
bank_addtl_full <- bank_addtl_full %>%
  mutate(across(where(is.character), as.factor))

# Relevel target variable to set 'yes' as positive class
bank_addtl_full$y <- relevel(bank_addtl_full$y, ref = "yes")
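caret's confusionMatrix treats the first factor level as the positive class, so this releveling is what makes 'yes' the positive class in the evaluation output below. A quick confirmation:

# 'yes' should now be the first (positive) level
levels(bank_addtl_full$y)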
# Split data into training and testing sets
set.seed(1989)
trainIndex_80 <- createDataPartition(bank_addtl_full$y, p = 0.8, list = FALSE)
trainData_80 <- bank_addtl_full[trainIndex_80, ]
testData_20  <- bank_addtl_full[-trainIndex_80, ]
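createDataPartition samples within each class, so both splits should preserve the roughly 11% 'yes' rate of the full data; a quick check:

# Stratified split: class proportions should match across train and test
prop.table(table(trainData_80$y))
prop.table(table(testData_20$y))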

SVM Training
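doParallel and foreach are loaded above but never registered; once a foreach backend is registered, train() runs its cross-validation folds in parallel automatically, which helps most with the polynomial grid below. A minimal sketch (the worker count is an assumption; match it to the machine):

# Register a parallel backend; caret::train picks it up automatically
cl <- makePSOCKcluster(4)  # 4 workers is an arbitrary choice
registerDoParallel(cl)
# ... run the train() calls below, then release the workers:
# stopCluster(cl)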

Linear

# Train a linear SVM using svmLinear
svm_model_linear <- train(
  y ~ .,        
  data = trainData_80,   
  method = "svmLinear", # Linear support vector machine
  trControl = trainControl(method = "cv", number = 5, sampling = "down"), # down sample to balance classes
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(C = c(0.01, 0.1, 1, 10))
)
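For reference, sampling = "down" in trainControl down-samples the majority class inside each CV fold, so every fold's model is fit on balanced data while the held-out fold stays imbalanced. caret's standalone downSample shows the effect on the training set (a sketch):

# Stand-alone equivalent of trainControl(sampling = "down"), for inspection
balanced <- downSample(x = trainData_80[, setdiff(names(trainData_80), "y")],
                       y = trainData_80$y, yname = "y")
table(balanced$y)  # both classes reduced to the minority-class count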

Radial

# Train a SVM using svmRadial
svm_model_radial <- train(
  y ~ .,        
  data = trainData_80,   
  method = "svmRadial", # Linear support vector machine
  trControl = trainControl(method = "cv", number = 5, sampling = "down"), # down sample to balance classes
  preProcess = c("zv", "center", "scale"),
  tuneGrid = expand.grid(C = c(0.1, 1, 10),
                         sigma = c(0.01, 0.1, 1))
)
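The sigma values above are hand-picked. kernlab, which backs svmRadial, provides sigest to estimate a plausible sigma range from the data (caret itself calls it when tuneLength is used instead of an explicit grid); a sketch, assuming the same dummy-coded model matrix that train() builds:

# Estimate a data-driven range for sigma (0.1, 0.5 and 0.9 quantile estimates)
library(kernlab)
x_mat <- model.matrix(y ~ ., data = trainData_80)[, -1]  # drop the intercept
sigest(x_mat, frac = 0.5)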

Polynomial

# Train a polynomial SVM using svmPoly
svm_model_polynomial <- train(
  y ~ .,        
  data = trainData_80,   
  method = "svmPoly", # Polynomial support vector machine
  trControl = trainControl(method = "cv", number = 5, sampling = "down"), # down sample to balance classes
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(C = c(1, 10),
                         degree = c(2, 3),
                         scale = c(0.001, 0.01))
)
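With all three fits sharing the same 5-fold scheme, caret's resamples collects their cross-validated metrics for a side-by-side look before touching the test set (strictly paired comparisons would require identical folds via an index in trainControl, which is not set here):

# Compare cross-validated Accuracy and Kappa across the three kernels
cv_results <- resamples(list(linear     = svm_model_linear,
                             radial     = svm_model_radial,
                             polynomial = svm_model_polynomial))
summary(cv_results)
bwplot(cv_results)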

Model Features and Parameters

Linear

# Print model and plot results of tuning
print(svm_model_linear)
## Support Vector Machines with Linear Kernel 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'yes', 'no' 
## 
## Pre-processing: centered (45), scaled (45) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26361, 26361, 26361, 26360 
## Addtional sampling using down-sampling prior to pre-processing
## 
## Resampling results across tuning parameters:
## 
##   C      Accuracy   Kappa    
##    0.01  0.8506874  0.4988768
##    0.10  0.8480471  0.4940665
##    1.00  0.8497162  0.4991691
##   10.00  0.8495038  0.4987059
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.01.
plot(svm_model_linear)

# Plot relative feature importance
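# (kernel SVMs expose no native importance in caret, so varImp falls back to a
# model-free, ROC-based score computed separately for each predictor)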
plot(varImp(svm_model_linear))

Radial

# Print model and plot results of tuning
print(svm_model_radial)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'yes', 'no' 
## 
## Pre-processing: centered (45), scaled (45) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26360, 26360, 26362, 26361 
## Addtional sampling using down-sampling prior to pre-processing
## 
## Resampling results across tuning parameters:
## 
##   C     sigma  Accuracy   Kappa     
##    0.1  0.01   0.8513852  0.49036737
##    0.1  0.10   0.5839273  0.18049427
##    0.1  1.00   0.3090804  0.02996622
##    1.0  0.01   0.8486235  0.49657928
##    1.0  0.10   0.8066524  0.41005167
##    1.0  1.00   0.3618101  0.06547393
##   10.0  0.01   0.8491699  0.49410535
##   10.0  0.10   0.8022220  0.39890246
##   10.0  1.00   0.3722191  0.06979386
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01 and C = 0.1.
plot(svm_model_radial)

# Plot relative feature importance
plot(varImp(svm_model_radial))

Polynomial

# Print model and plot results of tuning
print(svm_model_polynomial)
## Support Vector Machines with Polynomial Kernel 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'yes', 'no' 
## 
## Pre-processing: centered (45), scaled (45) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26361, 26360, 26362, 26360 
## Addtional sampling using down-sampling prior to pre-processing
## 
## Resampling results across tuning parameters:
## 
##   C   degree  scale  Accuracy   Kappa    
##    1  2       0.001  0.8505962  0.4943400
##    1  2       0.010  0.8509300  0.5023723
##    1  3       0.001  0.8505353  0.4959146
##    1  3       0.010  0.8510819  0.5016706
##   10  2       0.001  0.8479256  0.4946421
##   10  2       0.010  0.8526902  0.5041915
##   10  3       0.001  0.8498679  0.4985208
##   10  3       0.010  0.8482594  0.4891450
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 2, scale = 0.01 and C = 10.
plot(svm_model_polynomial)

# Plot relative feature importance
plot(varImp(svm_model_polynomial))

Model Evaluation with Predictions and Confusion Matrices

Linear

# Make predictions
predictions_svm_linear <- predict(svm_model_linear, newdata = testData_20)

# Evaluate model
confusionMatrix(predictions_svm_linear, testData_20$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  837 1175
##        no    91 6134
##                                          
##                Accuracy : 0.8463         
##                  95% CI : (0.8383, 0.854)
##     No Information Rate : 0.8873         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.4909         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9019         
##             Specificity : 0.8392         
##          Pos Pred Value : 0.4160         
##          Neg Pred Value : 0.9854         
##              Prevalence : 0.1127         
##          Detection Rate : 0.1016         
##    Detection Prevalence : 0.2443         
##       Balanced Accuracy : 0.8706         
##                                          
##        'Positive' Class : yes            
## 
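With only about 11% positives, a precision/recall framing is often more informative than raw accuracy; confusionMatrix can report it from the same predictions (a sketch):

# Same confusion matrix, reported as Precision / Recall / F1
confusionMatrix(predictions_svm_linear, testData_20$y, mode = "prec_recall")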

Radial

# Make predictions
predictions_svm_radial <- predict(svm_model_radial, newdata = testData_20)

# Evaluate model
confusionMatrix(predictions_svm_radial, testData_20$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  811 1123
##        no   117 6186
##                                           
##                Accuracy : 0.8495          
##                  95% CI : (0.8416, 0.8571)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4889          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.87392         
##             Specificity : 0.84635         
##          Pos Pred Value : 0.41934         
##          Neg Pred Value : 0.98144         
##              Prevalence : 0.11266         
##          Detection Rate : 0.09846         
##    Detection Prevalence : 0.23479         
##       Balanced Accuracy : 0.86014         
##                                           
##        'Positive' Class : yes             
## 

Polynomial

# Make predictions
predictions_svm_polynomial <- predict(svm_model_polynomial, newdata = testData_20)

# Evaluate model
confusionMatrix(predictions_svm_polynomial, testData_20$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  826 1195
##        no   102 6114
##                                           
##                Accuracy : 0.8425          
##                  95% CI : (0.8345, 0.8503)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4799          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8901          
##             Specificity : 0.8365          
##          Pos Pred Value : 0.4087          
##          Neg Pred Value : 0.9836          
##              Prevalence : 0.1127          
##          Detection Rate : 0.1003          
##    Detection Prevalence : 0.2454          
##       Balanced Accuracy : 0.8633          
##                                           
##        'Positive' Class : yes             
##
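To line the three kernels up on the held-out set without hand-copying numbers, the key statistics can be pulled straight out of the confusionMatrix objects; a minimal sketch:

# Side-by-side test-set metrics, extracted from each confusionMatrix object
cms <- list(linear     = confusionMatrix(predictions_svm_linear, testData_20$y),
            radial     = confusionMatrix(predictions_svm_radial, testData_20$y),
            polynomial = confusionMatrix(predictions_svm_polynomial, testData_20$y))
sapply(cms, function(cm)
  round(c(cm$overall[c("Accuracy", "Kappa")],
          cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]), 4))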