# Load packages
suppressWarnings(suppressMessages({
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(GGally)
  library(ggmosaic)
  library(caret)
  library(e1071)
  library(DMwR2)
  library(doParallel)
  library(foreach)
}))
# Load data
bank_addtl_full = read.csv("/Users/mollysiebecker/DATA 622/bank-additional-full.csv")
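A quick structural check of the loaded data could be run at this point (output omitted here):
# Inspect the dimensions and column types of the raw data
dim(bank_addtl_full)  # bank-additional-full should have 41,188 rows and 21 columns
str(bank_addtl_full)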
Data Pre-Processing
# Feature selection to improve computational efficiency
# Eliminate 'pdays' and 'default' (near zero variance)
bank_addtl_full <- bank_addtl_full %>% select(-pdays, -default)
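The near-zero-variance claim can be checked programmatically with caret's nearZeroVar(); a minimal sketch, run on the data before the select() above (output not shown):
# List the predictors caret flags as near-zero variance
nzv_metrics <- nearZeroVar(bank_addtl_full, saveMetrics = TRUE)
rownames(nzv_metrics)[nzv_metrics$nzv]  # expected to include 'pdays' (and possibly 'default')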
# Handling missing data
# Convert 'unknown' to NA
bank_addtl_full[bank_addtl_full == "unknown"] <- NA
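A quick check of how many missing values this introduces per column (counts not shown in the original):
# Count the NA values now present in each column
colSums(is.na(bank_addtl_full))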
# Mode imputation to fill in missing values
mode_job <- names(sort(table(bank_addtl_full$job), decreasing = TRUE))[1]
bank_addtl_full$job[is.na(bank_addtl_full$job)] <- mode_job
mode_marital <- names(sort(table(bank_addtl_full$marital), decreasing = TRUE))[1]
bank_addtl_full$marital[is.na(bank_addtl_full$marital)] <- mode_marital
mode_education <- names(sort(table(bank_addtl_full$education), decreasing = TRUE))[1]
bank_addtl_full$education[is.na(bank_addtl_full$education)] <- mode_education
mode_housing <- names(sort(table(bank_addtl_full$housing), decreasing = TRUE))[1]
bank_addtl_full$housing[is.na(bank_addtl_full$housing)] <- mode_housing
mode_loan <- names(sort(table(bank_addtl_full$loan), decreasing = TRUE))[1]
bank_addtl_full$loan[is.na(bank_addtl_full$loan)] <- mode_loan
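The five imputation steps above differ only in the column name; an equivalent, more compact alternative (a sketch only, not used in the fits below) would loop over the affected columns:
# Loop-based mode imputation over the columns that contained 'unknown'
cols_to_impute <- c("job", "marital", "education", "housing", "loan")
for (col in cols_to_impute) {
  mode_val <- names(sort(table(bank_addtl_full[[col]]), decreasing = TRUE))[1]
  bank_addtl_full[[col]][is.na(bank_addtl_full[[col]])] <- mode_val
}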
# Handling categorical data
# Convert character vectors to factors
bank_addtl_full <- bank_addtl_full %>%
  mutate(across(where(is.character), as.factor))
# Relevel target variable to set 'yes' as positive class
bank_addtl_full$y <- relevel(bank_addtl_full$y, ref = "yes")
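A quick confirmation that 'yes' is now the first (positive) level:
levels(bank_addtl_full$y)  # expected: "yes" "no"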
# Split data into training and testing sets
set.seed(1989)
trainIndex_80 <- createDataPartition(bank_addtl_full$y, p = 0.8, list = FALSE)
trainData_80 <- bank_addtl_full[trainIndex_80, ]
testData_20 <- bank_addtl_full[-trainIndex_80, ]
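Because createDataPartition samples within each level of y, the 80/20 split should preserve the original class proportions; a quick sanity check (output omitted):
# Compare class proportions across the full, training, and test sets
prop.table(table(bank_addtl_full$y))
prop.table(table(trainData_80$y))
prop.table(table(testData_20$y))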
SVM Training
Linear
# Train a linear SVM using svmLinear
svm_model_linear <- train(
  y ~ .,
  data = trainData_80,
  method = "svmLinear", # Linear support vector machine
  trControl = trainControl(method = "cv", number = 5, sampling = "down"), # down sample to balance classes
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(C = c(0.01, 0.1, 1, 10))
)
Radial
# Train a SVM using svmRadial
svm_model_radial <- train(
  y ~ .,
  data = trainData_80,
  method = "svmRadial", # Radial basis function support vector machine
  trControl = trainControl(method = "cv", number = 5, sampling = "down"), # down sample to balance classes
  preProcess = c("zv", "center", "scale"),
  tuneGrid = expand.grid(C = c(0.1, 1, 10),
                         sigma = c(0.01, 0.1, 1))
)
Polynomial
# Train a polynomial SVM using svmPoly
svm_model_polynomial <- train(
  y ~ .,
  data = trainData_80,
  method = "svmPoly", # Polynomial support vector machine
  trControl = trainControl(method = "cv", number = 5, sampling = "down"), # down sample to balance classes
  preProcess = c("center", "scale"),
  tuneGrid = expand.grid(C = c(1, 10),
                         degree = c(2, 3),
                         scale = c(0.001, 0.01))
)
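Fitting three SVMs with 5-fold cross-validation on roughly 33,000 rows is computationally expensive. Since doParallel and foreach are already loaded, the train() calls above could be parallelized by registering a backend first; a minimal sketch, assuming four worker cores are available:
# Register a parallel backend so caret runs cross-validation folds in parallel
cl <- makePSOCKcluster(4)  # assumption: 4 worker processes are available
registerDoParallel(cl)
# ... run the train() calls above; caret picks up the registered backend automatically ...
stopCluster(cl)
registerDoSEQ()  # return to sequential execution afterwards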
Model Features and Parameters
Linear
# Print model and plot results of tuning
print(svm_model_linear)
## Support Vector Machines with Linear Kernel
##
## 32951 samples
## 18 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: centered (45), scaled (45)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26361, 26361, 26361, 26360
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.01 0.8506874 0.4988768
## 0.10 0.8480471 0.4940665
## 1.00 0.8497162 0.4991691
## 10.00 0.8495038 0.4987059
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.01.
plot(svm_model_linear)

# Plot relative feature importance (kernlab SVMs have no built-in importance measure, so caret's varImp() falls back to a model-free, ROC-based filter importance per predictor)
plot(varImp(svm_model_linear))

Radial
# Print model and plot results of tuning
print(svm_model_radial)
## Support Vector Machines with Radial Basis Function Kernel
##
## 32951 samples
## 18 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: centered (45), scaled (45)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26360, 26360, 26362, 26361
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 0.1 0.01 0.8513852 0.49036737
## 0.1 0.10 0.5839273 0.18049427
## 0.1 1.00 0.3090804 0.02996622
## 1.0 0.01 0.8486235 0.49657928
## 1.0 0.10 0.8066524 0.41005167
## 1.0 1.00 0.3618101 0.06547393
## 10.0 0.01 0.8491699 0.49410535
## 10.0 0.10 0.8022220 0.39890246
## 10.0 1.00 0.3722191 0.06979386
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01 and C = 0.1.
plot(svm_model_radial)

# Plot relative feature importance
plot(varImp(svm_model_radial))

Polynomial
# Print model and plot results of tuning
print(svm_model_polynomial)
## Support Vector Machines with Polynomial Kernel
##
## 32951 samples
## 18 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: centered (45), scaled (45)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26361, 26360, 26362, 26360
## Addtional sampling using down-sampling prior to pre-processing
##
## Resampling results across tuning parameters:
##
## C degree scale Accuracy Kappa
## 1 2 0.001 0.8505962 0.4943400
## 1 2 0.010 0.8509300 0.5023723
## 1 3 0.001 0.8505353 0.4959146
## 1 3 0.010 0.8510819 0.5016706
## 10 2 0.001 0.8479256 0.4946421
## 10 2 0.010 0.8526902 0.5041915
## 10 3 0.001 0.8498679 0.4985208
## 10 3 0.010 0.8482594 0.4891450
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 2, scale = 0.01 and C = 10.
plot(svm_model_polynomial)

# Plot relative feature importance
plot(varImp(svm_model_polynomial))

Model Evaluation with Predictions and Confusion Matrices
Linear
# Make predictions
predictions_svm_linear <- predict(svm_model_linear, newdata = testData_20)
# Evaluate model
confusionMatrix(predictions_svm_linear, testData_20$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 837 1175
## no 91 6134
##
## Accuracy : 0.8463
## 95% CI : (0.8383, 0.854)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4909
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9019
## Specificity : 0.8392
## Pos Pred Value : 0.4160
## Neg Pred Value : 0.9854
## Prevalence : 0.1127
## Detection Rate : 0.1016
## Detection Prevalence : 0.2443
## Balanced Accuracy : 0.8706
##
## 'Positive' Class : yes
##
Radial
# Make predictions
predictions_svm_radial <- predict(svm_model_radial, newdata = testData_20)
# Evaluate model
confusionMatrix(predictions_svm_radial, testData_20$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 811 1123
## no 117 6186
##
## Accuracy : 0.8495
## 95% CI : (0.8416, 0.8571)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4889
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.87392
## Specificity : 0.84635
## Pos Pred Value : 0.41934
## Neg Pred Value : 0.98144
## Prevalence : 0.11266
## Detection Rate : 0.09846
## Detection Prevalence : 0.23479
## Balanced Accuracy : 0.86014
##
## 'Positive' Class : yes
##
Polynomial
# Make predictions
predictions_svm_polynomial <- predict(svm_model_polynomial, newdata = testData_20)
# Evaluate model
confusionMatrix(predictions_svm_polynomial, testData_20$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 826 1195
## no 102 6114
##
## Accuracy : 0.8425
## 95% CI : (0.8345, 0.8503)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4799
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8901
## Specificity : 0.8365
## Pos Pred Value : 0.4087
## Neg Pred Value : 0.9836
## Prevalence : 0.1127
## Detection Rate : 0.1003
## Detection Prevalence : 0.2454
## Balanced Accuracy : 0.8633
##
## 'Positive' Class : yes
##
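Finally, the three fits could also be compared on their cross-validation resamples rather than only on the held-out test set, using caret's resamples(); a minimal sketch (output not shown; ideally all three models would share the same fold indices for a strictly paired comparison):
# Collect and compare the cross-validation results of the three SVMs
svm_resamples <- resamples(list(Linear = svm_model_linear,
                                Radial = svm_model_radial,
                                Polynomial = svm_model_polynomial))
summary(svm_resamples)  # accuracy and kappa distributions across folds
bwplot(svm_resamples)   # side-by-side boxplots of resampled performance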