1. Read the following articles:
https://www.hindawi.com/journals/complexity/2021/5550344/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8137961/
2. Search for academic content (at least 3 articles) that compare the use of decision trees vs SVMs in your current area of expertise.
3. Perform an analysis of the dataset used in Homework #2 using the SVM algorithm.
4. Compare the results with the results from previous homework.
5. Answer questions, such as:
Which algorithm is recommended to get more accurate results?
Is it better for classification or regression scenarios?
Do you agree with the recommendations?
Why?
Load Libraries:
Below are the libraries used to complete this assignment:
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(highcharter)
library(dplyr)
library(randomForest)
library(ROCR)
library(pROC)
library(knitr)
library(kableExtra)
library(e1071) # For SVM
The data used in Assignment 2 was the
Bank Marketing Dataset.
A Portuguese bank conducted a marketing campaign (phone calls) with the goal of predicting whether a client will subscribe to a term deposit. The records of these efforts are available as a dataset, which can be downloaded from: https://archive.ics.uci.edu/dataset/222/bank+marketing
set.seed(123)
bank<- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/bank-full.csv",sep=";")
We will follow the same steps as in the previous homework to prepare the data for modeling. In addition to those preprocessing steps, data scaling was added.
# Replace "unknown" with NA
bank <- bank %>% mutate_all(~ifelse(. == "unknown", NA, .))
# Handle missing values
for (col in names(bank)) {
if (is.factor(bank[[col]])) {
mode_val <- names(sort(table(bank[[col]]), decreasing = TRUE))[1]
bank[[col]][is.na(bank[[col]])] <- mode_val
}
}
# Convert categorical variables to factors
bank <- data.frame(lapply(bank, function(x) if(is.character(x)) factor(x) else x))
# Feature Engineering: Creating age_group
bank$age_group <- cut(bank$age, breaks = c(17, 24, 34, 44, 54, 64, 100),
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+"))
# Create a new feature based on call duration
bank <- bank %>% mutate(long_call = if_else(duration > median(duration, na.rm = TRUE), "yes", "no"))
# Feature Engineering: Creating balance_group (income_group)
bank$balance_group <- ifelse(bank$balance <= 500, "low",
ifelse(bank$balance <= 2000, "medium", "high"))
# Convert new features to factors
bank$age_group <- as.factor(bank$age_group)
bank$balance_group <- as.factor(bank$balance_group)
bank$long_call <- as.factor(bank$long_call)
#Remove remaining rows with any NA values to avoid errors
bank <- na.omit(bank)
print(summary(bank))
## age job marital education default
## Min. :18.00 management :1753 divorced: 887 primary :1012 no :7786
## 1st Qu.:32.00 blue-collar:1537 married :4501 secondary:4197 yes: 56
## Median :38.00 technician :1289 single :2454 tertiary :2633
## Mean :40.78 admin. :1057
## 3rd Qu.:47.00 services : 682
## Max. :89.00 retired : 458
## (Other) :1066
## balance housing loan contact day
## Min. :-1884 no :2900 no :6753 cellular :7257 Min. : 1.00
## 1st Qu.: 162 yes:4942 yes:1089 telephone: 585 1st Qu.: 7.00
## Median : 595 Median :14.00
## Mean : 1552 Mean :14.26
## 3rd Qu.: 1734 3rd Qu.:20.00
## Max. :81204 Max. :31.00
##
## month duration campaign pdays
## may :2436 Min. : 5.0 Min. : 1.000 Min. : 1.0
## nov :1093 1st Qu.: 113.0 1st Qu.: 1.000 1st Qu.:133.0
## apr :1075 Median : 194.0 Median : 2.000 Median :195.0
## feb : 881 Mean : 261.3 Mean : 2.064 Mean :223.3
## aug : 493 3rd Qu.: 324.0 3rd Qu.: 2.000 3rd Qu.:326.0
## jan : 472 Max. :2219.0 Max. :16.000 Max. :871.0
## (Other):1392
## previous poutcome y age_group long_call
## Min. : 1.000 failure:4679 no :6056 18-24: 157 no :3652
## 1st Qu.: 1.000 other :1750 yes:1786 25-34:2602 yes:4190
## Median : 2.000 success:1413 35-44:2592
## Mean : 3.184 45-54:1456
## 3rd Qu.: 4.000 55-64: 769
## Max. :275.000 65+ : 266
##
## balance_group
## high :1725
## low :3584
## medium:2533
##
##
##
##
head(bank,10)
## age job marital education default balance housing loan contact
## 24061 33 admin. married tertiary no 882 no no telephone
## 24063 42 admin. single secondary no -247 yes yes telephone
## 24065 33 services married secondary no 3444 yes no telephone
## 24073 36 management married tertiary no 2415 yes no telephone
## 24078 36 management married tertiary no 0 yes no telephone
## 24087 44 blue-collar married secondary no 1324 yes no telephone
## 24123 26 technician single tertiary no 172 no yes telephone
## 24128 51 admin. single secondary no 3132 no no telephone
## 24152 33 unemployed divorced secondary no 1005 yes no telephone
## 24166 30 admin. married secondary no 873 yes no telephone
## day month duration campaign pdays previous poutcome y age_group
## 24061 21 oct 39 1 151 3 failure no 25-34
## 24063 21 oct 519 1 166 1 other yes 35-44
## 24065 21 oct 144 1 91 4 failure yes 25-34
## 24073 22 oct 73 1 86 4 other no 35-44
## 24078 23 oct 140 1 143 3 failure yes 35-44
## 24087 25 oct 119 1 89 2 other no 35-44
## 24123 4 nov 21 1 140 4 other no 25-34
## 24128 5 nov 449 1 176 1 failure no 45-54
## 24152 10 nov 175 1 174 2 failure no 25-34
## 24166 12 nov 119 1 167 3 success no 25-34
## long_call balance_group
## 24061 no medium
## 24063 yes low
## 24065 no high
## 24073 no high
## 24078 no low
## 24087 no medium
## 24123 no low
## 24128 yes high
## 24152 no medium
## 24166 no medium
# Split the data (70% training, 30% testing)
trainIndex <- createDataPartition(bank$y, p = 0.7, list = FALSE)
trainData <- bank[trainIndex, ]
testData <- bank[-trainIndex, ]
# Check the distribution of target variable in both sets
prop.table(table(trainData$y))
##
## no yes
## 0.7721726 0.2278274
prop.table(table(testData$y))
##
## no yes
## 0.7724373 0.2275627
# Data Scaling (Standardization)
numeric_cols <- sapply(bank, is.numeric)
preprocess_obj <- preProcess(trainData[, numeric_cols], method = c("center", "scale"))
trainData[, numeric_cols] <- predict(preprocess_obj, trainData[, numeric_cols])
testData[, numeric_cols] <- predict(preprocess_obj, testData[, numeric_cols])
The dataset is loaded, cleaned, and preprocessed to ensure all variables are correctly formatted, and any missing values are handled. Feature engineering steps were included to create additional features that may improve model performance.
We now continue our experiments with the Support Vector Machine implementation.
Hypothesis: Linear kernel with the default cost of 1 will be a better model for making classifications on the data than the algorithms from the previous assignment
The linear kernel SVM serves as a baseline model. It assumes a linear relationship between the features and the target variable (customer subscription). The model should work adequately when the data exhibits linear separability but might perform poorly when the relationships between variables are highly non-linear.
# SVM with Linear Kernel
set.seed(123)
svm_linear <- svm(y ~ ., data=trainData, kernel="linear", probability=TRUE)
summary(svm_linear)
##
## Call:
## svm(formula = y ~ ., data = trainData, kernel = "linear", probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 1919
##
## ( 967 952 )
##
##
## Number of Classes: 2
##
## Levels:
## no yes
svm_linear_pred <- predict(svm_linear, testData)
svm_linear_prob <- predict(svm_linear, testData, probability=TRUE)
svm_linear_cm <- confusionMatrix(svm_linear_pred, testData$y, positive="yes")
svm_linear_roc <- roc(testData$y, as.numeric(attr(svm_linear_prob, "probabilities")[,2]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
svm_linear_cm$overall["Accuracy"]
## Accuracy
## 0.8434709
svm_linear_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1690 242
## yes 126 293
##
## Accuracy : 0.8435
## 95% CI : (0.8281, 0.8579)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5179
##
## Mcnemar's Test P-Value : 2.037e-09
##
## Sensitivity : 0.5477
## Specificity : 0.9306
## Pos Pred Value : 0.6993
## Neg Pred Value : 0.8747
## Prevalence : 0.2276
## Detection Rate : 0.1246
## Detection Prevalence : 0.1782
## Balanced Accuracy : 0.7391
##
## 'Positive' Class : yes
##
The default linear SVM achieved an accuracy of approximately 84.35% and significant predictive power (p-value < 2.2e-16), but with better performance on the negative class than the positive. This suggests that there is some degree of linear separability in the data, and this performance provides a benchmark against which to compare the other models. Given the relatively high accuracy, it is likely that a roughly linear relationship exists between certain features and the target variable.
The default SVM with linear kernel performed a little better than the default decision tree, which had an accuracy of 83.79%. Comparing the confusion matrices, the default linear-kernel SVM has a slightly higher count of true negatives (1690, with 242 false negatives) than the default decision tree model (1669 true negatives, 234 false negatives).
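As a rough check on which features drive this linear decision boundary, the hyperplane weights can be recovered from the support vectors. This is a small illustrative sketch (not part of the original analysis); the weights refer to the internally scaled model-matrix columns, so they indicate relative influence rather than raw effect sizes.
# Sketch: recover the linear hyperplane weights as t(coefs) %*% SV
w <- t(svm_linear$coefs) %*% svm_linear$SV
# The largest-magnitude weights point to the most influential (scaled) features
head(sort(abs(w[1, ]), decreasing = TRUE), 10)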
Hypothesis: Tuning the cost parameter will improve linear SVM performance
To enhance the linear SVM's performance, we adjust the cost parameter (C), which balances the trade-off between low training error and low generalization error. A high cost parameter reduces training errors at the risk of overfitting, while a low cost parameter emphasizes a larger margin, accepting more training errors in exchange for better generalization.
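For reference, C is the penalty weight in the standard soft-margin objective that the SVM solves (the textbook formulation, shown here only to make the trade-off explicit):

$$\min_{w,\,b,\,\xi}\ \tfrac{1}{2}\lVert w \rVert^2 + C \sum_{i=1}^{n} \xi_i \quad \text{subject to}\quad y_i\,(w^\top x_i + b) \ge 1 - \xi_i,\ \ \xi_i \ge 0,$$

so a larger C penalizes margin violations (the slack terms $\xi_i$) more heavily and shrinks the margin, while a smaller C tolerates more violations in exchange for a wider margin.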
# Define the grid of cost values to test
tune_grid <- expand.grid(cost = c(0.001, 0.01, 0.1, 1, 5, 10))
# Perform grid search with cross-validation
set.seed(123)
tune_control <- tune.control(cross = 5) # 5-fold cross-validation
svm_tune <- tune.svm(y ~ ., data = trainData, kernel = "linear",
cost = tune_grid$cost,
tunecontrol = tune_control)
# Print the best model
print(svm_tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 5-fold cross validation
##
## - best parameters:
## cost
## 5
##
## - best performance: 0.1586254
# Get the best cost value
best_cost <- svm_tune$best.parameters$cost
Now that we have identified the best cost value (5), we can train the SVM model using this optimized parameter. In theory this should give us a model that generalizes better to unseen data than one trained with a default or arbitrary cost value.
# Train the SVM model with the best cost
set.seed(123)
svm_linear_tuned <- svm(y ~ ., data=trainData, kernel="linear", cost=best_cost, probability=TRUE)
# Make predictions on the test data
svm_linear_pred_tuned <- predict(svm_linear_tuned, testData)
svm_linear_prob_tuned <- predict(svm_linear_tuned, testData, probability=TRUE)
# Evaluate the tuned model
svm_linear_cm_tuned <- confusionMatrix(svm_linear_pred_tuned, testData$y, positive="yes")
svm_linear_roc_tuned <- roc(testData$y, as.numeric(attr(svm_linear_prob_tuned, "probabilities")[,2]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Print the results
print(svm_linear_cm_tuned$overall["Accuracy"])
## Accuracy
## 0.8421948
print(svm_linear_cm_tuned)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1688 243
## yes 128 292
##
## Accuracy : 0.8422
## 95% CI : (0.8268, 0.8567)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5143
##
## Mcnemar's Test P-Value : 3.247e-09
##
## Sensitivity : 0.5458
## Specificity : 0.9295
## Pos Pred Value : 0.6952
## Neg Pred Value : 0.8742
## Prevalence : 0.2276
## Detection Rate : 0.1242
## Detection Prevalence : 0.1786
## Balanced Accuracy : 0.7377
##
## 'Positive' Class : yes
##
The tuned model did not perform better, which may suggest that the default cost value was already close to optimal, or that further tuning with a different range of cost values, or other kernels, is needed. We therefore continue with the next experiment to see whether switching from a linear kernel to a Radial Basis Function (RBF) kernel improves performance. The RBF kernel is a popular choice for non-linear data.
Hypothesis: Radial kernel will capture non-linear relationships better
The radial kernel SVM should outperform the linear kernel when the feature-target relationships show non-linear patterns. The radial kernel implicitly maps the data into a high-dimensional space, which simplifies the task of finding a separating hyperplane. With default settings, however, the radial-kernel SVM may fail to capture specific non-linear patterns in the data because it is not optimally configured.
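For reference, the radial (RBF) kernel computes similarity as

$$K(x_i, x_j) = \exp\!\left(-\gamma \,\lVert x_i - x_j \rVert^2\right),$$

where gamma controls how quickly similarity decays with distance; e1071's default gamma is 1 divided by the number of model-matrix columns, which is the value used in the default fit below.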
# SVM with Radial Kernel
svm_radial <- svm(y ~ ., data=trainData, kernel="radial", probability=TRUE)
summary(svm_radial)
##
## Call:
## svm(formula = y ~ ., data = trainData, kernel = "radial", probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 2090
##
## ( 1062 1028 )
##
##
## Number of Classes: 2
##
## Levels:
## no yes
# Make predictions on the test data
svm_radial_pred <- predict(svm_radial, testData)
svm_radial_prob <- predict(svm_radial, testData, probability=TRUE)
# Create confusion matrix
svm_radial_cm <- confusionMatrix(svm_radial_pred, testData$y, positive="yes")
# Calculate accuracy
accuracy <- sum(svm_radial_cm$table[1, 1], svm_radial_cm$table[2, 2]) / sum(svm_radial_cm$table)
cat("Accuracy of the SVM model with radial kernel:", accuracy, "\n")
## Accuracy of the SVM model with radial kernel: 0.8434709
# ROC analysis
svm_radial_roc <- roc(testData$y, as.numeric(attr(svm_radial_prob, "probabilities")[, 2]))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
The default radial SVM achieved an accuracy of approximately 84.35%, essentially identical to the default linear SVM. This may indicate that the default parameters for the radial kernel (specifically gamma) are not well suited to this dataset, or that the data does not have strong non-linear relationships that a radial kernel can exploit without tuning.
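One way to test the gamma hypothesis directly in e1071, before moving to the caret grid search below, would be a small grid over gamma and cost with tune.svm. This is an illustrative sketch only; the candidate values are assumptions and this code was not run as part of the analysis.
# Sketch (not run): explore gamma and cost for the radial kernel in e1071
set.seed(123)
svm_radial_grid <- tune.svm(y ~ ., data = trainData, kernel = "radial",
                            gamma = c(0.001, 0.01, 0.1, 1),  # kernel width candidates
                            cost  = c(0.1, 1, 5, 10),        # regularization candidates
                            tunecontrol = tune.control(cross = 5))
summary(svm_radial_grid)  # cross-validated error for each gamma/cost combination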
Hypothesis: Tuning both cost and gamma will improve radial SVM performance
Tuning the parameters of the radial-kernel SVM should significantly improve its performance. By optimizing these parameters, the model can better capture complex non-linear relationships in the data, leading to higher accuracy and better generalization.
# Define the parameter grid for tuning
set.seed(123)
tune_grid <- expand.grid(
C = c(0.001, 0.01, 0.1, 1, 5, 10),
sigma = c(0.001, 0.01, 0.1, 1, 5, 10)
)
The caret package requires the tuning-grid column for the RBF kernel parameter to be named 'sigma' (the name used by the underlying kernlab implementation) rather than 'gamma'; otherwise it raises "Error: The tuning parameter grid should have columns sigma, C" at the tuning step.
# Set up cross-validation
fitControl <- trainControl(
method = "cv",
number = 5, # Number of folds
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE
)
# Tune the SVM model
svm_tune <- train(
y ~ .,
data = trainData,
method = "svmRadial",
trControl = fitControl,
tuneGrid = tune_grid,
metric = "ROC"
)
## maximum number of iterations reached (warning repeated by the underlying solver for many fold/parameter combinations; remaining repetitions omitted)
# Print the best tuning parameters
print(svm_tune$bestTune)
## sigma C
## 20 0.01 1
# Make predictions using the best model
svm_tuned_pred <- predict(svm_tune, testData)
svm_tuned_prob <- predict(svm_tune, testData, type = "prob")
# Evaluate the tuned model
svm_tuned_cm <- confusionMatrix(svm_tuned_pred, testData$y, positive = "yes")
print(svm_tuned_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1706 256
## yes 110 279
##
## Accuracy : 0.8443
## 95% CI : (0.829, 0.8588)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.51
##
## Mcnemar's Test P-Value : 3.475e-14
##
## Sensitivity : 0.5215
## Specificity : 0.9394
## Pos Pred Value : 0.7172
## Neg Pred Value : 0.8695
## Prevalence : 0.2276
## Detection Rate : 0.1187
## Detection Prevalence : 0.1655
## Balanced Accuracy : 0.7305
##
## 'Positive' Class : yes
##
Unfortunately, the tuned radial-kernel SVM did not show much improvement. The default and tuned radial-kernel SVMs show similar accuracy, 0.843 and 0.844 respectively.
plot_multiple_roc <- function(list_of_rocs, model_names) {
plot(list_of_rocs[[1]], col = 1, main = "ROC Curves Comparison")
for(i in 2:length(list_of_rocs)) {
lines(list_of_rocs[[i]], col = i)
}
legend("bottomright", legend = model_names, col = 1:length(list_of_rocs), lwd = 2)
}
# Store ROC objects
roc_list <- list(
svm_linear_roc,
svm_linear_roc_tuned,
svm_radial_roc,
roc(testData$y, svm_tuned_prob[,"yes"])
)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC curves
plot_multiple_roc(roc_list,
c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"))
performance_metrics <- data.frame(
Model = c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"),
Accuracy = c(svm_linear_cm$overall['Accuracy'],
svm_linear_cm_tuned$overall['Accuracy'],
svm_radial_cm$overall['Accuracy'],
svm_tuned_cm$overall['Accuracy']),
Precision = c(svm_linear_cm$byClass['Pos Pred Value'],
svm_linear_cm_tuned$byClass['Pos Pred Value'],
svm_radial_cm$byClass['Pos Pred Value'],
svm_tuned_cm$byClass['Pos Pred Value']),
Recall = c(svm_linear_cm$byClass['Sensitivity'],
svm_linear_cm_tuned$byClass['Sensitivity'],
svm_radial_cm$byClass['Sensitivity'],
svm_tuned_cm$byClass['Sensitivity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Visualize performance metrics
performance_long <- gather(performance_metrics,
Metric, Value, -Model)
ggplot(performance_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Performance Comparison of SVM Models",
y = "Score", x = "Model") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Performance Metrics Table Creation
performance_metrics <- data.frame(
Model = c("SVM Linear", "SVM Tuned Linear", "SVM Radial", "SVM Tuned Radial"),
Accuracy = c(svm_linear_cm$overall['Accuracy'], svm_linear_cm_tuned$overall['Accuracy'], svm_radial_cm$overall['Accuracy'], svm_tuned_cm$overall['Accuracy']),
Sensitivity = c(svm_linear_cm$byClass['Sensitivity'], svm_linear_cm_tuned$byClass['Sensitivity'], svm_radial_cm$byClass['Sensitivity'], svm_tuned_cm$byClass['Sensitivity']),
Specificity = c(svm_linear_cm$byClass['Specificity'], svm_linear_cm_tuned$byClass['Specificity'], svm_radial_cm$byClass['Specificity'], svm_tuned_cm$byClass['Specificity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Display Performance Metrics Table
kable(performance_metrics, format = "html") %>%
kableExtra::kable_styling(full_width = F)
| Model | Accuracy | Sensitivity | Specificity | F1_Score |
|---|---|---|---|---|
| SVM Linear | 0.8434709 | 0.5476636 | 0.9306167 | 0.6142558 |
| SVM Tuned Linear | 0.8421948 | 0.5457944 | 0.9295154 | 0.6115183 |
| SVM Radial | 0.8434709 | 0.5121495 | 0.9410793 | 0.5982533 |
| SVM Tuned Radial | 0.8443216 | 0.5214953 | 0.9394273 | 0.6038961 |
The accuracy values of all four models fall between 0.842 and 0.845, showing that they classify nearly identical percentages of instances correctly. Sensitivity varies more across the models than the other performance metrics: the linear SVMs detect positive cases best (about 0.546-0.548), while the radial SVMs give up some sensitivity (0.512-0.522) in exchange for slightly higher specificity and precision. Sensitivity measures how well the model detects positive cases.
The models maintain consistent specificity values ranging from 0.929 to 0.941, which demonstrates their strong ability to identify negative cases correctly. Selecting the appropriate model requires weighing sensitivity against specificity for the planned use: the default linear SVM is preferable when identifying positive cases is the priority, while the tuned radial SVM offers the best accuracy and precision at the cost of some sensitivity. The small differences between models indicate that all four perform at a similar level of effectiveness.
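Because accuracy alone hides the sensitivity/specificity trade-off, the ROC objects built above can also be summarized by AUC. A small sketch, assuming roc_list from the ROC plot is still in the session (AUC values are not reported elsewhere in this document):
# Sketch: AUC for each model from the pROC objects created earlier
auc_values <- sapply(roc_list, pROC::auc)
data.frame(Model = c("Linear SVM", "Tuned Linear SVM", "Radial SVM", "Tuned Radial SVM"),
           AUC = round(as.numeric(auc_values), 4))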
According to the previous experiments, the SVM models achieved lower accuracy than the Random Forest models, which reached about 85%, and the SVM models show variable sensitivity and specificity values. The tuned radial SVM demonstrates strong negative-case identification through its high specificity of about 0.94. Random Forest models outperform SVM models in both accuracy and sensitivity, while the Decision Tree models achieve the lowest accuracy.
Random Forest models demonstrate superior accuracy and sensitivity compared to SVM models in this analysis, but the choice of the most appropriate model needs to be aligned with the specific demands and priorities of each application. For applications that require high specificity, the tuned radial SVM presents a viable solution. The Decision Tree models show inferior performance compared to the other models.
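If the fitted objects from the previous homework were still available, this cross-assignment comparison could be made explicit in one table. The following is a sketch under that assumption; dt_cm and rf_cm are hypothetical names for the decision tree and random forest confusionMatrix objects from Homework #2 and are not defined in this report.
# Sketch (assumes dt_cm and rf_cm exist from the previous assignment)
comparison <- data.frame(
  Model       = c("Decision Tree", "Random Forest", "SVM Tuned Radial"),
  Accuracy    = c(dt_cm$overall["Accuracy"],    rf_cm$overall["Accuracy"],    svm_tuned_cm$overall["Accuracy"]),
  Sensitivity = c(dt_cm$byClass["Sensitivity"], rf_cm$byClass["Sensitivity"], svm_tuned_cm$byClass["Sensitivity"]),
  Specificity = c(dt_cm$byClass["Specificity"], rf_cm$byClass["Specificity"], svm_tuned_cm$byClass["Specificity"])
)
kable(comparison, digits = 4)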
Demonstrate that the provided articles were read by drawing insights, summarizing the articles, or comparing them:
https://www.hindawi.com/journals/complexity/2021/5550344/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8137961/
The two articles present analyses demonstrating how decision-tree ensemble methods can predict Covid-19 infections from laboratory data by handling imbalanced datasets and emphasizing appropriate machine learning techniques and evaluation metrics. The studies demonstrate the effectiveness of ensemble methods for imbalanced datasets and show that age is a critical factor in prediction models. Both articles acknowledge the challenge presented by imbalanced datasets in Covid-19 infection prediction. The first dataset contains 600 patient samples with a class ratio of roughly 1:6.5.
The second dataset includes 5644 patients, where positive cases account for approximately 10%. Class imbalance leads to biased models, necessitating special correction techniques. These methods operate robustly and produce accurate outcomes when applied to unbalanced datasets. Both studies employed the evaluation metrics accuracy, precision, recall, F1-measure, AUC-ROC, and AUPRC, and the results demonstrate that classifiers designed for imbalanced data achieve superior outcomes. The balanced random forest (RUS) outperformed other methods according to AUPRC, while RUSBagging yielded superior AUROC results. Merging age information with laboratory test data enhances predictive accuracy; studies that ignored age as a significant factor failed to achieve high accuracy estimates.
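The imbalance lesson from these articles could be carried over to the bank data, where only about 23% of the retained cases are positive. One simple option, sketched here as an assumption rather than as part of the analysis above, is to weight the minority class more heavily when fitting the SVM.
# Sketch (not run): class-weighted radial SVM to counter the class imbalance
wts <- c(no = 1, yes = sum(trainData$y == "no") / sum(trainData$y == "yes"))
set.seed(123)
svm_weighted <- svm(y ~ ., data = trainData, kernel = "radial",
                    class.weights = wts, probability = TRUE)
confusionMatrix(predict(svm_weighted, testData), testData$y, positive = "yes")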
https://medium.com/@jangdaehan1/svm-versus-decision-trees-a-comparative-analysis-in-supervised-learning-07e6fcc14ecd
This analytical piece reviews Support Vector Machines (SVM) and Decision Trees by examining their methods and benefits while addressing their challenges and practical use cases in supervised learning. Support Vector Machines perform well in high-dimensional data spaces and provide strong resistance to overfitting whereas Decision Trees provide clear interpretability and user-friendly application despite being susceptible to overfitting. The discussion presents performance comparisons along with contextual application significance while emphasizing the vital need for informed algorithm selection in the evolving artificial intelligence domain.
https://www.coursera.org/articles/difference-between-svm-and-decision-tree
This article examines how Support Vector Machines (SVMs) and decision trees function as machine learning models for data classification and describes their respective mechanisms while assessing their benefits and challenges and practical applications. Support Vector Machines function well in spaces with many dimensions and offer versatility through various kernel functions whereas decision trees provide easy comprehension alongside flexibility with diverse data types and can be applied to classification and regression problems. The selection process between SVMs and decision trees should be based on the specific requirements of a project and its intended application.
https://scialert.net/fulltext/?doi=itj.2009.64.70
The study compares how accurately Support Vector Machine (SVM) and Decision Tree (DT) methods classify satellite imagery data from Langkawi Island. In this image classification task, the SVM with a radial basis function kernel demonstrated superior performance, with an overall accuracy of 76.0004%, compared to the Decision Tree method, which achieved 68.7846%.
Researchers implemented Decision Tree (DT) and Support Vector Machine (SVM) algorithms to analyze SPOT 5 satellite imagery. The development of DT rules was carried out manually through analysis of Normalized Difference Vegetation Index (NDVI) and Brightness Value (BV) variables. The SVM method was implemented automatically using four kernel types: linear, polynomial, radial basis function, and sigmoid.
This assignment focused on using the Support Vector Machine (SVM) algorithm to examine the dataset from Homework #2 and then compared the results with previous assignments.
The Bank Marketing Dataset serves as the foundation for the data analysis in Assignment 2. It comes from a Portuguese bank campaign that used phone calls to predict whether clients would subscribe to a term deposit. The dataset, which contains the records of these marketing activities, can be downloaded from https://archive.ics.uci.edu/dataset/222/bank+marketing.
At the start of the project, the dataset was prepared for modeling. The data preparation process consisted of three steps: loading the data, cleaning it to remove inconsistencies and format variables properly, and pre-processing to handle missing values. During this phase, feature engineering created new features that might improve model performance. In this assignment, data scaling was applied after splitting the dataset in order to avoid data leakage.
Experiment 8: SVM with Linear Kernel
Hypothesis: Linear kernel with the default cost of 1 will be a better model for making classifications on the data than the algorithms from the previous assignment
The linear kernel SVM serves as a baseline model. It assumes a linear relationship between the features and the target variable (customer subscription). The model should work adequately when the data exhibits linear separability but might perform poorly when the relationships between variables are highly non-linear.
The svm function from the e1071 package
enabled the construction of a linear kernel SVM model. This experiment
tested if a linear kernel with its default cost parameter would produce
superior classification outcomes in comparison to algorithms from the
prior assignment.
The default linear SVM achieved an accuracy of approximately 84.35% and significant predictive power (p-value < 2.2e-16), but with better performance on the negative class than the positive. This suggests that there is some degree of linear separability in the data, and this performance provides a benchmark against which to compare the other models. Given the relatively high accuracy, it is likely that a roughly linear relationship exists between certain features and the target variable.
The default SVM with linear kernel performed a little better than the default decision tree, which had an accuracy of 83.79%. Comparing the confusion matrices, the default linear-kernel SVM has slightly higher counts of correct predictions than the default decision tree model.
Experiment 9: Tuned Linear Kernel SVM
Hypothesis: Tuning the cost parameter will improve linear SVM performance
To enhance the linear SVM's performance, we adjust the cost parameter (C), which balances the trade-off between low training error and low generalization error. A high cost parameter reduces training errors at the risk of overfitting, while a low cost parameter emphasizes a larger margin, accepting more training errors in exchange for better generalization.
In the subsequent experiment aimed at enhancing the linear SVM model,
a grid search accompanied by cross-validation was conducted utilizing
tune.svm. The procedure involved defining a range of cost
values and implementing 5-fold cross-validation to identify the optimal
cost parameter.
The selected best cost value of 5 was then employed to train a refined SVM model. This model achieved an accuracy of 0.8422, which is almost the same as the tuned Decision Tree accuracy of 84.17% from the previous assignment.
The tuned SVM with linear kernel did not improve on the default model, indicating that the initial cost value may have been near optimal, or that additional tuning with a broader range of cost values or alternative kernels could be warranted. Therefore, I proceeded to the next experiment to assess whether transitioning from a linear kernel to a Radial Basis Function (RBF) kernel would enhance performance.
Experiment 10: Radial Kernel SVM (Default)
Hypothesis: Radial kernel will capture non-linear relationships better
The assumption for this experiment was that the radial kernel SVM should outperform the linear kernel when the feature-target relationships show non-linear patterns. The radial kernel implicitly maps the data into a high-dimensional space, which simplifies the task of finding a separating hyperplane. With default settings, however, the radial-kernel SVM may fail to capture specific non-linear patterns in the data because it is not optimally configured.
Ultimately, an SVM model utilizing a radial kernel was constructed, followed by a similar evaluation process.
The default radial SVM achieved an accuracy of approximately 84.35%, essentially identical to the default linear SVM (84.35%). This may indicate that the default parameters for the radial kernel (specifically gamma) are not well suited to this dataset, or that the data does not have strong non-linear relationships that a radial kernel can exploit without tuning.
Experiment 11: Tuned Radial Kernel SVM
Hypothesis: Tuning both cost and gamma will improve radial SVM performance
The expectation for this experiment was that tuning the parameters of the radial-kernel SVM would significantly improve its performance: by optimizing these parameters the model can better capture complex non-linear relationships in the data, leading to higher accuracy and better generalization. Unfortunately, the tuned radial-kernel SVM did not show much improvement. The default and tuned radial-kernel SVMs show similar accuracy, 0.843 and 0.844 respectively.
Overall, all four models show similar accuracy rates, from 0.842 to 0.845, indicating that they classify about the same percentage of cases correctly. The models differ more in sensitivity than in the other metrics: the linear SVMs detect positive cases best (about 0.546-0.548), while the radial SVMs give up some sensitivity (0.512-0.522) in exchange for slightly higher specificity and precision. The sensitivity metric measures how well the model detects positive cases.
The models maintain consistent specificity levels between 0.929 and 0.941, which demonstrates their efficiency at accurately identifying negative cases. The choice of model depends on how important sensitivity is compared to specificity in the specific application: when catching positive cases is essential, the default linear SVM is the stronger choice, whereas the tuned radial SVM offers the best accuracy and precision at the cost of some sensitivity. The performance variations between the four models remain minor, which demonstrates that they all deliver equivalent effectiveness.
Compare with the previous models
The Random Forest model with default settings reached the highest accuracy of 0.8545300 along with high sensitivity of 0.9207040. However, it has a much lower specificity of 0.6299065 compared to the SVM Tuned Radial model, which achieves a specificity of 0.9394.
Despite achieving superior specificity, the SVM Tuned Radial model exhibits lower accuracy and sensitivity than the Random Forest model. The selection of the best model relies on the specific requirements of the application: the SVM Tuned Radial model is the top choice where high specificity matters most, while the Random Forest model is the superior option when high accuracy and sensitivity are required. To reach a definitive conclusion, an evaluation must weigh the consequences of false positives against those of false negatives.
Additional context and insights from the literature underline the critical role of kernel selection and hyperparameter tuning while showcasing the trade-offs that exist among various modeling approaches. The best model choice emerges from an analysis of the dataset features together with the marketing campaign objectives.
Model choice needs to align with the organization's business objectives. The bank should evaluate multiple models and adjust decision thresholds to enhance recall so that it discovers more potential subscribers. Organizations may prioritize Decision Trees for the transparency of their decisions even though these models can exhibit lower accuracy than more complex models, while Random Forests gain some interpretability through feature importance rankings.
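To illustrate the threshold idea, the predicted probabilities from the tuned radial SVM could be cut at a lower value than the default 0.5 to favour recall. This is a minimal sketch, assuming svm_tuned_prob from the tuning step is still in the session; the 0.3 cutoff is illustrative only.
# Sketch: lower the decision threshold to catch more potential subscribers
yes_prob    <- svm_tuned_prob[, "yes"]
pred_recall <- factor(ifelse(yes_prob > 0.3, "yes", "no"), levels = c("no", "yes"))
confusionMatrix(pred_recall, testData$y, positive = "yes")  # expect higher recall, lower precision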
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(highcharter)
library(dplyr)
library(randomForest)
library(ROCR)
library(pROC)
library(knitr)
library(kableExtra)
library(e1071) # For SVM
set.seed(123)
bank<- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/bank-full.csv",sep=";")
# Replace "unknown" with NA
bank <- bank %>% mutate_all(~ifelse(. == "unknown", NA, .))
# Handle missing values
for (col in names(bank)) {
if (is.factor(bank[[col]])) {
mode_val <- names(sort(table(bank[[col]]), decreasing = TRUE))[1]
bank[[col]][is.na(bank[[col]])] <- mode_val
}
}
# Convert categorical variables to factors
bank <- data.frame(lapply(bank, function(x) if(is.character(x)) factor(x) else x))
# Feature Engineering: Creating age_group
bank$age_group <- cut(bank$age, breaks = c(17, 24, 34, 44, 54, 64, 100),
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+"))
# Create a new feature based on call duration
bank <- bank %>% mutate(long_call = if_else(duration > median(duration, na.rm = TRUE), "yes", "no"))
# Feature Engineering: Creating balance_group (income_group)
bank$balance_group <- ifelse(bank$balance <= 500, "low",
ifelse(bank$balance <= 2000, "medium", "high"))
# Convert new features to factors
bank$age_group <- as.factor(bank$age_group)
bank$balance_group <- as.factor(bank$balance_group)
bank$long_call <- as.factor(bank$long_call)
#Remove remaining rows with any NA values to avoid errors
bank <- na.omit(bank)
print(summary(bank))
head(bank,10)
# Split the data (70% training, 30% testing)
trainIndex <- createDataPartition(bank$y, p = 0.7, list = FALSE)
trainData <- bank[trainIndex, ]
testData <- bank[-trainIndex, ]
# Check the distribution of target variable in both sets
prop.table(table(trainData$y))
prop.table(table(testData$y))
# Data Scaling (Standardization)
numeric_cols <- sapply(bank, is.numeric)
preprocess_obj <- preProcess(trainData[, numeric_cols], method = c("center", "scale"))
trainData[, numeric_cols] <- predict(preprocess_obj, trainData[, numeric_cols])
testData[, numeric_cols] <- predict(preprocess_obj, testData[, numeric_cols])
# SVM with Linear Kernel
set.seed(123)
svm_linear <- svm(y ~ ., data=trainData, kernel="linear", probability=TRUE)
summary(svm_linear)
svm_linear_pred <- predict(svm_linear, testData)
svm_linear_prob <- predict(svm_linear, testData, probability=TRUE)
svm_linear_cm <- confusionMatrix(svm_linear_pred, testData$y, positive="yes")
svm_linear_roc <- roc(testData$y, as.numeric(attr(svm_linear_prob, "probabilities")[,2]))
svm_linear_cm$overall["Accuracy"]
svm_linear_cm
# Define the grid of cost values to test
tune_grid <- expand.grid(cost = c(0.001, 0.01, 0.1, 1, 5, 10))
# Perform grid search with cross-validation
set.seed(123)
tune_control <- tune.control(cross = 5) # 5-fold cross-validation
svm_tune <- tune.svm(y ~ ., data = trainData, kernel = "linear",
cost = tune_grid$cost,
tunecontrol = tune_control)
# Print the best model
print(svm_tune)
# Get the best cost value
best_cost <- svm_tune$best.parameters$cost
# Train the SVM model with the best cost
set.seed(123)
svm_linear_tuned <- svm(y ~ ., data=trainData, kernel="linear", cost=best_cost, probability=TRUE)
# Make predictions on the test data
svm_linear_pred_tuned <- predict(svm_linear_tuned, testData)
svm_linear_prob_tuned <- predict(svm_linear_tuned, testData, probability=TRUE)
# Evaluate the tuned model
svm_linear_cm_tuned <- confusionMatrix(svm_linear_pred_tuned, testData$y, positive="yes")
svm_linear_roc_tuned <- roc(testData$y, as.numeric(attr(svm_linear_prob_tuned, "probabilities")[,2]))
# Print the results
print(svm_linear_cm_tuned$overall["Accuracy"])
print(svm_linear_cm_tuned)
# SVM with Radial Kernel
svm_radial <- svm(y ~ ., data=trainData, kernel="radial", probability=TRUE)
summary(svm_radial)
# Make predictions on the test data
svm_radial_pred <- predict(svm_radial, testData)
svm_radial_prob <- predict(svm_radial, testData, probability=TRUE)
# Create confusion matrix
svm_radial_cm <- confusionMatrix(svm_radial_pred, testData$y, positive="yes")
# Calculate accuracy
accuracy <- sum(svm_radial_cm$table[1, 1], svm_radial_cm$table[2, 2]) / sum(svm_radial_cm$table)
cat("Accuracy of the SVM model with radial kernel:", accuracy, "\n")
# ROC analysis
svm_radial_roc <- roc(testData$y, as.numeric(attr(svm_radial_prob, "probabilities")[, 2]))
# Define the parameter grid for tuning
set.seed(123)
tune_grid <- expand.grid(
C = c(0.001, 0.01, 0.1, 1, 5, 10),
sigma = c(0.001, 0.01, 0.1, 1, 5, 10)
)
# Set up cross-validation
fitControl <- trainControl(
method = "cv",
number = 5, # Number of folds
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE
)
# Tune the SVM model
svm_tune <- train(
y ~ .,
data = trainData,
method = "svmRadial",
trControl = fitControl,
tuneGrid = tune_grid,
metric = "ROC"
)
# Print the best tuning parameters
print(svm_tune$bestTune)
# Make predictions using the best model
svm_tuned_pred <- predict(svm_tune, testData)
svm_tuned_prob <- predict(svm_tune, testData, type = "prob")
# Evaluate the tuned model
svm_tuned_cm <- confusionMatrix(svm_tuned_pred, testData$y, positive = "yes")
print(svm_tuned_cm)
plot_multiple_roc <- function(list_of_rocs, model_names) {
plot(list_of_rocs[[1]], col = 1, main = "ROC Curves Comparison")
for(i in 2:length(list_of_rocs)) {
lines(list_of_rocs[[i]], col = i)
}
legend("bottomright", legend = model_names, col = 1:length(list_of_rocs), lwd = 2)
}
# Store ROC objects
roc_list <- list(
svm_linear_roc,
svm_linear_roc_tuned,
svm_radial_roc,
roc(testData$y, svm_tuned_prob[,"yes"])
)
# Plot ROC curves
plot_multiple_roc(roc_list,
c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"))
performance_metrics <- data.frame(
Model = c("Linear SVM", "Tuned Linear SVM",
"Radial SVM", "Tuned Radial SVM"),
Accuracy = c(svm_linear_cm$overall['Accuracy'],
svm_linear_cm_tuned$overall['Accuracy'],
svm_radial_cm$overall['Accuracy'],
svm_tuned_cm$overall['Accuracy']),
Precision = c(svm_linear_cm$byClass['Pos Pred Value'],
svm_linear_cm_tuned$byClass['Pos Pred Value'],
svm_radial_cm$byClass['Pos Pred Value'],
svm_tuned_cm$byClass['Pos Pred Value']),
Recall = c(svm_linear_cm$byClass['Sensitivity'],
svm_linear_cm_tuned$byClass['Sensitivity'],
svm_radial_cm$byClass['Sensitivity'],
svm_tuned_cm$byClass['Sensitivity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Visualize performance metrics
performance_long <- gather(performance_metrics,
Metric, Value, -Model)
ggplot(performance_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Performance Comparison of SVM Models",
y = "Score", x = "Model") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Performance Metrics Table Creation
performance_metrics <- data.frame(
Model = c("SVM Linear", "SVM Tuned Linear", "SVM Radial", "SVM Tuned Radial"),
Accuracy = c(svm_linear_cm$overall['Accuracy'], svm_linear_cm_tuned$overall['Accuracy'], svm_radial_cm$overall['Accuracy'], svm_tuned_cm$overall['Accuracy']),
Sensitivity = c(svm_linear_cm$byClass['Sensitivity'], svm_linear_cm_tuned$byClass['Sensitivity'], svm_radial_cm$byClass['Sensitivity'], svm_tuned_cm$byClass['Sensitivity']),
Specificity = c(svm_linear_cm$byClass['Specificity'], svm_linear_cm_tuned$byClass['Specificity'], svm_radial_cm$byClass['Specificity'], svm_tuned_cm$byClass['Specificity']),
F1_Score = c(svm_linear_cm$byClass['F1'],
svm_linear_cm_tuned$byClass['F1'],
svm_radial_cm$byClass['F1'],
svm_tuned_cm$byClass['F1'])
)
# Display Performance Metrics Table
kable(performance_metrics, format = "html") %>%
kableExtra::kable_styling(full_width = F)