# Load dataset
dataBank <- read.csv("C:/Users/vitug/OneDrive/Desktop/CUNY Masters/DATA_622/bank data.csv", stringsAsFactors = TRUE)
head(dataBank, 10)
## X age job marital education default balance housing loan contact
## 1 1 58 management married tertiary no 2143 yes no unknown
## 2 2 44 technician single secondary no 29 yes no unknown
## 3 3 33 entrepreneur married secondary no 2 yes yes unknown
## 4 4 47 blue-collar married unknown no 1506 yes no unknown
## 5 5 33 unknown single unknown no 1 no no unknown
## 6 6 35 management married tertiary no 231 yes no unknown
## 7 7 28 management single tertiary no 447 yes yes unknown
## 8 8 42 entrepreneur divorced tertiary yes 2 yes no unknown
## 9 9 58 retired married primary no 121 yes no unknown
## 10 10 43 technician single secondary no 593 yes no unknown
## day month campaign previous term age_group credit_risk Subscription
## 1 5 may 1 0 no Senior Medium Risk no
## 2 5 may 1 0 no Middle-aged Medium Risk no
## 3 5 may 1 0 no Middle-aged High Risk no
## 4 5 may 1 0 no Middle-aged Medium Risk no
## 5 5 may 1 0 no Middle-aged Medium Risk no
## 6 5 may 1 0 no Middle-aged Medium Risk no
## 7 5 may 1 0 no Middle-aged High Risk no
## 8 5 may 1 0 no Middle-aged Medium Risk no
## 9 5 may 1 0 no Senior Medium Risk no
## 10 5 may 1 0 no Middle-aged Medium Risk no
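Before building a balanced subset, it helps to confirm how imbalanced the target actually is. A minimal check (a sketch only, assuming term is the term-deposit flag used as the target below):
# Check the class distribution of the target variable (output not shown)
table(dataBank$term)
prop.table(table(dataBank$term))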
# Create a balanced subset of data
set.seed(123)
# Get indices of each class
term_yes <- which(dataBank$term == "yes")
term_no <- which(dataBank$term == "no")
# Sample size calculation: draw up to 5,000 observations from each class for a balanced subset
sample_size <- min(5000, length(term_yes))
sampled_yes <- sample(term_yes, sample_size)
sampled_no <- sample(term_no, sample_size)
# Create balanced subset
indices <- c(sampled_yes, sampled_no)
bank_subset <- dataBank[indices, ]
# Feature selection, use only most important features
important_features <- c("age", "balance", "campaign", "previous", "job", "contact", "month")
bank_subset <- bank_subset[, c(important_features, "term")]
# Split data
set.seed(123)
trainIndex <- createDataPartition(bank_subset$term, p = 0.7, list = FALSE)
train_data <- bank_subset[trainIndex, ]
test_data <- bank_subset[-trainIndex, ]
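As a quick sanity check (a sketch, not part of the modeling pipeline), the stratified split should preserve the roughly 50/50 class balance in both partitions:
# Verify the split preserved the class balance (output not shown)
prop.table(table(train_data$term))
prop.table(table(test_data$term))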
With the processed data, I will build and compare three SVM models: one with a linear kernel, one with a tuned radial kernel, and one with a polynomial kernel.
# Base SVM model with linear kernel
svm_linear <- svm(term ~ ., data = train_data,
kernel = "linear",
cost = 1,
probability = TRUE,
scale = TRUE)
# Predictions
svm_pred <- predict(svm_linear, test_data)
svm_prob <- predict(svm_linear, test_data, probability = TRUE)
# Performance evaluation
conf_matrix <- confusionMatrix(svm_pred, test_data$term, positive = "yes")
# Extract metrics
accuracy <- conf_matrix$overall["Accuracy"]
sensitivity <- conf_matrix$byClass["Sensitivity"]
specificity <- conf_matrix$byClass["Specificity"]
f1 <- conf_matrix$byClass["F1"]
# Calculate AUC
svm_roc <- roc(as.numeric(test_data$term) - 1,
as.numeric(attr(svm_prob, "probabilities")[,"yes"]))
auc <- auc(svm_roc)
# Display results
results <- data.frame(
Algorithm = "SVM (Linear)",
Accuracy = accuracy,
F1_Score = f1,
Sensitivity = sensitivity,
Specificity = specificity,
AUC = as.numeric(auc)
)
print(results)
## Algorithm Accuracy F1_Score Sensitivity Specificity AUC
## Accuracy SVM (Linear) 0.639 0.6179894 0.584 0.694 0.7059298
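The ROC curve behind this AUC can also be plotted directly from the pROC object computed above; a minimal sketch (output not shown):
# Plot the ROC curve for the linear-kernel SVM
plot(svm_roc, main = "ROC Curve - SVM (Linear)", col = "steelblue", lwd = 2)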
# SVM model with radial kernel: tune hyperparameters on a smaller random subset to keep runtime manageable
tune_subset <- bank_subset[sample(nrow(bank_subset), min(2000, nrow(bank_subset))), ]
# Tune SVM parameters
set.seed(123)
tune_result <- tune.svm(term ~ ., data = tune_subset,
kernel = "radial",
gamma = 10^(-5:-1),
cost = 10^(0:2))
# Get best parameters
best_gamma <- tune_result$best.parameters$gamma
best_cost <- tune_result$best.parameters$cost
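Before refitting on the full training set, the tuning grid can be inspected to see how cross-validation error varies with gamma and cost; a minimal sketch using e1071's tuning object (output not shown):
# Inspect cross-validation performance across the gamma/cost grid
summary(tune_result)
plot(tune_result)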
# Train optimized SVM
svm_radial <- svm(term ~ ., data = train_data,
kernel = "radial",
gamma = best_gamma,
cost = best_cost,
probability = TRUE)
# Evaluate optimized model
svm_opt_pred <- predict(svm_radial, test_data)
svm_opt_prob <- predict(svm_radial, test_data, probability = TRUE)
# Performance metrics
conf_matrix_opt <- confusionMatrix(svm_opt_pred, test_data$term, positive = "yes")
# Extract metrics
accuracy_opt <- conf_matrix_opt$overall["Accuracy"]
sensitivity_opt <- conf_matrix_opt$byClass["Sensitivity"]
specificity_opt <- conf_matrix_opt$byClass["Specificity"]
f1_opt <- conf_matrix_opt$byClass["F1"]
# Calculate AUC
svm_opt_roc <- roc(as.numeric(test_data$term) - 1,
as.numeric(attr(svm_opt_prob, "probabilities")[,"yes"]))
auc_opt <- auc(svm_opt_roc)
# Display results
results_opt <- data.frame(
Algorithm = "SVM (Radial - Tuned)",
Accuracy = accuracy_opt,
F1_Score = f1_opt,
Sensitivity = sensitivity_opt,
Specificity = specificity_opt,
AUC = as.numeric(auc_opt)
)
# SVM with Polynomial Kernel Implementation
set.seed(123)
# Get indices of each class
term_yes <- which(dataBank$term == "yes")
term_no <- which(dataBank$term == "no")
# Sample size calculation: use a smaller balanced sample (up to 3,000 per class) for faster training
sample_size <- min(3000, length(term_yes))
sampled_yes <- sample(term_yes, sample_size)
sampled_no <- sample(term_no, sample_size)
# Create balanced subset
indices <- c(sampled_yes, sampled_no)
bank_subset <- dataBank[indices, ]
# Feature selection - use only important features
important_features <- c("age", "balance", "campaign", "previous", "job", "contact", "month")
bank_subset <- bank_subset[, c(important_features, "term")]
# Split data
set.seed(123)
trainIndex <- createDataPartition(bank_subset$term, p = 0.7, list = FALSE)
train_data <- bank_subset[trainIndex, ]
test_data <- bank_subset[-trainIndex, ]
# I'll use degree=2 which is common for polynomial kernels
cat("Training polynomial SVM model with fixed parameters...\n")
## Training polynomial SVM model with fixed parameters...
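For reference, e1071 parameterizes the polynomial kernel as (gamma * <u, v> + coef0)^degree, so degree = 2 with coef0 = 1 adds pairwise interaction terms on top of the linear ones. A standalone toy illustration of that formula (not used by the model itself; the helper function is hypothetical):
# Toy illustration of the polynomial kernel; e1071's default gamma is 1 / (number of features)
poly_kernel <- function(u, v, gamma = 1 / length(u), coef0 = 1, degree = 2) {
  (gamma * sum(u * v) + coef0)^degree
}
poly_kernel(c(1, 2), c(3, 4))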
# Try-catch block to handle potential errors
tryCatch({
# Train SVM with polynomial kernel using fixed parameters
svm_poly <- svm(term ~ .,
data = train_data,
kernel = "polynomial",
degree = 2,
coef0 = 1,
cost = 1,
probability = TRUE,
scale = TRUE)
cat("Model trained successfully.\n")
# Verify the model exists
if(!exists("svm_poly")) {
stop("Model training failed silently.")
}
# Save model to ensure it's available
saveRDS(svm_poly, "svm_poly_model.rds")
cat("Model saved to file.\n")
# Evaluate model
cat("Generating predictions...\n")
svm_poly_pred <- predict(svm_poly, test_data)
# Calculate probability predictions if needed
cat("Calculating probability predictions...\n")
svm_poly_prob <- predict(svm_poly, test_data, probability = TRUE)
# Calculate metrics
cat("Calculating performance metrics...\n")
conf_matrix_poly <- confusionMatrix(svm_poly_pred, test_data$term, positive = "yes")
# Extract performance metrics
accuracy_poly <- conf_matrix_poly$overall["Accuracy"]
sensitivity_poly <- conf_matrix_poly$byClass["Sensitivity"]
specificity_poly <- conf_matrix_poly$byClass["Specificity"]
f1_poly <- conf_matrix_poly$byClass["F1"]
# Calculate AUC
prob_yes <- attr(svm_poly_prob, "probabilities")[,"yes"]
svm_poly_roc <- roc(as.numeric(test_data$term) - 1, as.numeric(prob_yes))
auc_poly <- auc(svm_poly_roc)
# Display results
results_poly <- data.frame(
Algorithm = "SVM (Polynomial)",
Accuracy = accuracy_poly,
F1_Score = f1_poly,
Sensitivity = sensitivity_poly,
Specificity = specificity_poly,
AUC = as.numeric(auc_poly)
)
print(results_poly)
}, error = function(e) {
cat("Error occurred during model training or evaluation:\n")
print(e)
# Alternative approach - use a simpler kernel if polynomial fails
cat("\nTrying alternative approach with linear kernel...\n")
# Train SVM with linear kernel as fallback
svm_linear <- svm(term ~ .,
data = train_data,
kernel = "linear",
cost = 1,
probability = TRUE,
scale = TRUE)
# Evaluate linear model
svm_linear_pred <- predict(svm_linear, test_data)
svm_linear_prob <- predict(svm_linear, test_data, probability = TRUE)
# Calculate metrics
conf_matrix_linear <- confusionMatrix(svm_linear_pred, test_data$term, positive = "yes")
# Extract performance metrics
accuracy_linear <- conf_matrix_linear$overall["Accuracy"]
sensitivity_linear <- conf_matrix_linear$byClass["Sensitivity"]
specificity_linear <- conf_matrix_linear$byClass["Specificity"]
f1_linear <- conf_matrix_linear$byClass["F1"]
# Calculate AUC
prob_yes_linear <- attr(svm_linear_prob, "probabilities")[,"yes"]
svm_linear_roc <- roc(as.numeric(test_data$term) - 1, as.numeric(prob_yes_linear))
auc_linear <- auc(svm_linear_roc)
# Display results
results_linear <- data.frame(
Algorithm = "SVM (Linear - Fallback)",
Accuracy = accuracy_linear,
F1_Score = f1_linear,
Sensitivity = sensitivity_linear,
Specificity = specificity_linear,
AUC = as.numeric(auc_linear)
)
print(results_linear)
})
## Model trained successfully.
## Model saved to file.
## Generating predictions...
## Calculating probability predictions...
## Calculating performance metrics...
## Algorithm Accuracy F1_Score Sensitivity Specificity AUC
## Accuracy SVM (Polynomial) 0.6905556 0.6733138 0.6377778 0.7433333 0.7564827
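Because the fitted model was saved with saveRDS above, it can be reloaded in a later session without retraining; a minimal sketch, assuming the .rds file is still in the working directory:
# Reload the saved polynomial SVM when needed
svm_poly <- readRDS("svm_poly_model.rds")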
# Combine results
all_results <- rbind(results, results_opt, results_poly)
print(all_results)
## Algorithm Accuracy F1_Score Sensitivity Specificity
## Accuracy SVM (Linear) 0.6390000 0.6179894 0.5840000 0.6940000
## Accuracy1 SVM (Radial - Tuned) 0.6786667 0.6466276 0.5880000 0.7693333
## Accuracy2 SVM (Polynomial) 0.6905556 0.6733138 0.6377778 0.7433333
## AUC
## Accuracy 0.7059298
## Accuracy1 0.7401458
## Accuracy2 0.7564827
# Create a complete data frame with all results
results_df <- data.frame(
Algorithm = c(
"Random Forest (Baseline)", "Random Forest (mtry Tuning)", "Random Forest (ntree Tuning)",
"SVM (Linear)", "SVM (Radial - Tuned)", "SVM (Polynomial)",
"Decision Tree (Baseline)", "Decision Tree (CP Tuning)", "Decision Tree (Feature Selection)",
"AdaBoost (Baseline)", "AdaBoost (mfinal Tuning)", "AdaBoost (Class Weighting)"
),
Accuracy = c(
0.886816, 0.886153, 0.887996, 0.885142, 0.888754, 0.886984,
0.883056, 0.882244, 0.882244, 0.882982, 0.884014, 0.883719
),
F1_Score = c(
0.281703, 0.321020, 0.291181, 0.301468, 0.325683, 0.316895,
NA, 0.198695, 0.198695, 0.288660, 0.280092, 0.276274
),
Sensitivity = c(
0.189786, 0.230139, 0.196721, 0.205741, 0.227584, 0.219507,
0.000000, 0.124842, 0.124842, 0.203026, 0.192938, 0.189786
),
Specificity = c(
0.979125, 0.973029, 0.979542, 0.976385, 0.978341, 0.977256,
1.000000, 0.982548, 0.982548, 0.973029, 0.975534, 0.975618
),
AUC = c(
0.774884, 0.773993, 0.776913, 0.769812, 0.778241, 0.774639,
0.500000, 0.651225, 0.651225, NA, NA, NA
)
)
# Sort by accuracy (descending)
results_df_sorted <- results_df[order(results_df$Accuracy, decreasing = TRUE), ]
# Define color scale for the Accuracy column (higher values shade toward green)
accuracy_color <- formatter("span",
style = function(x) {
min_val <- min(results_df_sorted$Accuracy, na.rm=TRUE)
max_val <- max(results_df_sorted$Accuracy, na.rm=TRUE)
normalized <- (x - min_val)/(max_val - min_val)
style(
"display" = "block",
"padding" = "0 4px",
"border-radius" = "4px",
"background-color" = rgb(1 - normalized*0.8,
0.8 + normalized*0.2,
0.8)
)
})
# Create the formattable
formattable(results_df_sorted, list(
Algorithm = formatter("span", style = function(x) {
style(color = "black",
font.weight = ifelse(grepl("SVM", x), "bold", "normal"))
}),
Accuracy = accuracy_color,
F1_Score = color_tile("#FAFAFA", "#C5E5FF"),
Sensitivity = color_tile("#FAFAFA", "#C5E5FF"),
Specificity = color_tile("#FAFAFA", "#C5E5FF"),
AUC = color_tile("#FAFAFA", "#C5E5FF")
))
| | Algorithm | Accuracy | F1_Score | Sensitivity | Specificity | AUC |
|---|---|---|---|---|---|---|
| 5 | SVM (Radial - Tuned) | 0.888754 | 0.325683 | 0.227584 | 0.978341 | 0.778241 |
| 3 | Random Forest (ntree Tuning) | 0.887996 | 0.291181 | 0.196721 | 0.979542 | 0.776913 |
| 6 | SVM (Polynomial) | 0.886984 | 0.316895 | 0.219507 | 0.977256 | 0.774639 |
| 1 | Random Forest (Baseline) | 0.886816 | 0.281703 | 0.189786 | 0.979125 | 0.774884 |
| 2 | Random Forest (mtry Tuning) | 0.886153 | 0.321020 | 0.230139 | 0.973029 | 0.773993 |
| 4 | SVM (Linear) | 0.885142 | 0.301468 | 0.205741 | 0.976385 | 0.769812 |
| 11 | AdaBoost (mfinal Tuning) | 0.884014 | 0.280092 | 0.192938 | 0.975534 | NA |
| 12 | AdaBoost (Class Weighting) | 0.883719 | 0.276274 | 0.189786 | 0.975618 | NA |
| 7 | Decision Tree (Baseline) | 0.883056 | NA | 0.000000 | 1.000000 | 0.500000 |
| 10 | AdaBoost (Baseline) | 0.882982 | 0.288660 | 0.203026 | 0.973029 | NA |
| 8 | Decision Tree (CP Tuning) | 0.882244 | 0.198695 | 0.124842 | 0.982548 | 0.651225 |
| 9 | Decision Tree (Feature Selection) | 0.882244 | 0.198695 | 0.124842 | 0.982548 | 0.651225 |
# Create data frame with the Random Forest and SVM results for visualization
results_df <- data.frame(
Algorithm = c("Random Forest (Baseline)", "Random Forest (mtry Tuning)", "Random Forest (ntree Tuning)", "SVM (Linear)", "SVM (Radial - Tuned)", "SVM (Polynomial)"),
Accuracy = c(0.886816, 0.886153, 0.887996, 0.885142, 0.888754, 0.886984),
F1_Score = c(0.281703, 0.321020, 0.291181, 0.301468, 0.325683, 0.316895),
Sensitivity = c(0.189786, 0.230139, 0.196721, 0.205741, 0.227584, 0.219507),
Specificity = c(0.979125, 0.973029, 0.979542, 0.976385, 0.978341, 0.977256),
AUC = c(0.774884, 0.773993, 0.776913, 0.769812, 0.778241, 0.774639)
)
# Add algorithm type as a factor
results_df$AlgoType <- factor(
ifelse(grepl("Random Forest", results_df$Algorithm), "Random Forest", "SVM"),
levels = c("Random Forest", "SVM")
)
# Create short model names for better display in plots
results_df$ShortName <- factor(gsub(" \\(.*\\)", "", results_df$Algorithm))
# Extract the tuning label from the parentheses, default to "Baseline", then convert to a factor
results_df$ModelType <- gsub(".*\\((.*)\\)", "\\1", results_df$Algorithm)
results_df$ModelType[!grepl("\\(", results_df$Algorithm)] <- "Baseline"
results_df$ModelType <- factor(results_df$ModelType)
# Create labels for bars
results_df$AccuracyLabel <- sprintf("%.3f", results_df$Accuracy)
# Sort data for better visualization
results_df <- results_df %>%
arrange(AlgoType, desc(Accuracy))
# PLOT 1: Accuracy comparison
# Use facets by algorithm type for better comparison
p1 <- ggplot(results_df, aes(x = reorder(interaction(ShortName, ModelType, sep="\n"), Accuracy),
y = Accuracy, fill = AlgoType)) +
geom_bar(stat = "identity") +
geom_text(aes(label = AccuracyLabel), vjust = -0.3, size = 3) +
facet_wrap(~ AlgoType, scales = "free_x", nrow = 1) +
scale_fill_brewer(palette = "Set1") +
labs(title = "Accuracy Comparison by Algorithm Type",
x = "",
y = "Accuracy") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
strip.text = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold")) +
# Zoom the y-axis with coord_cartesian; scale limits would drop bars, which start at 0
scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
coord_cartesian(ylim = c(0.88, 0.892))
# PLOT 2: F1 Score comparison with clearer values
results_df$F1_Score_viz <- results_df$F1_Score
results_df$F1_Score_viz[is.na(results_df$F1_Score_viz)] <- 0
results_df$F1Label <- ifelse(is.na(results_df$F1_Score), "NA", sprintf("%.3f", results_df$F1_Score))
p2 <- ggplot(results_df, aes(x = reorder(interaction(ShortName, ModelType, sep="\n"), F1_Score_viz),
y = F1_Score_viz, fill = AlgoType)) +
geom_bar(stat = "identity") +
geom_text(aes(label = F1Label), vjust = -0.3, size = 3) +
facet_wrap(~ AlgoType, scales = "free_x", nrow = 1) +
scale_fill_brewer(palette = "Set1") +
labs(title = "F1 Score Comparison by Algorithm Type",
x = "",
y = "F1 Score") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
strip.text = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold")) +
scale_y_continuous(limits = c(0, 0.35),
labels = scales::percent_format(accuracy = 0.1))
# PLOT 3: ROC AUC comparison with clearer labels
auc_data <- results_df[!is.na(results_df$AUC), ]
auc_data$AUCLabel <- sprintf("%.3f", auc_data$AUC)
p3 <- ggplot(auc_data, aes(x = reorder(interaction(ShortName, ModelType, sep="\n"), AUC),
y = AUC, fill = AlgoType)) +
geom_bar(stat = "identity") +
geom_text(aes(label = AUCLabel), vjust = -0.3, size = 3) +
facet_wrap(~ AlgoType, scales = "free_x", nrow = 1) +
scale_fill_brewer(palette = "Set1") +
labs(title = "AUC Comparison by Algorithm Type",
x = "",
y = "AUC") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
strip.text = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold")) +
# Zoom with coord_cartesian so the bars are not dropped by out-of-range scale limits
scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
coord_cartesian(ylim = c(0.4, 0.8))
# PLOT 4: Sensitivity vs Specificity as a grouped bar chart
metrics_long <- results_df %>%
select(Algorithm, ShortName, ModelType, AlgoType, Sensitivity, Specificity) %>%
pivot_longer(cols = c(Sensitivity, Specificity),
names_to = "Metric",
values_to = "Value")
# Create labels for values
metrics_long$ValueLabel <- sprintf("%.3f", metrics_long$Value)
# Create the plot with direct value labels
p4 <- ggplot(metrics_long, aes(x = interaction(ShortName, ModelType, sep="\n"),
y = Value,
fill = Metric)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.6) +
geom_text(aes(label = ValueLabel),
position = position_dodge(width = 0.7),
vjust = -0.5, size = 2.5) +
facet_wrap(~ AlgoType, scales = "free_x", nrow = 1) +
scale_fill_manual(values = c("Sensitivity" = "#619CFF", "Specificity" = "#F8766D")) +
labs(title = "Sensitivity vs Specificity by Algorithm Type",
x = "",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
strip.text = element_text(face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold")) +
guides(fill = guide_legend(title = NULL)) +
scale_y_continuous(limits = c(0, 1.05),
labels = scales::percent_format(accuracy = 0.1))
# PLOT 5: Comparison of top models from each algorithm type
# Get the top model from each algorithm type
top_models <- results_df %>%
group_by(AlgoType) %>%
arrange(desc(Accuracy)) %>%
slice(1) %>%
ungroup()
# Format label for display
top_models$AlgoLabel <- paste0(top_models$ShortName, "\n(", top_models$ModelType, ")")
# Prepare data in long format for radar chart alternative
top_models_long <- top_models %>%
select(AlgoLabel, AlgoType, Accuracy, F1_Score, Sensitivity, Specificity, AUC) %>%
mutate(F1_Score = ifelse(is.na(F1_Score), 0, F1_Score),
AUC = ifelse(is.na(AUC), 0, AUC)) %>%
pivot_longer(cols = c(Accuracy, F1_Score, Sensitivity, Specificity, AUC),
names_to = "Metric",
values_to = "Value")
# Create comparative bar chart for top models
p5 <- ggplot(top_models_long, aes(x = Metric, y = Value, fill = AlgoLabel)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = sprintf("%.3f", Value)),
position = position_dodge(width = 0.8),
vjust = -0.5, size = 3) +
scale_fill_brewer(palette = "Set1") +
labs(title = "Comparison of Top Models by Algorithm Type",
x = "Performance Metric",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank(),
axis.text.x = element_text(angle = 0, hjust = 0.5),
plot.title = element_text(hjust = 0.5, face = "bold")) +
scale_y_continuous(limits = c(0, 1.05),
labels = scales::percent_format(accuracy = 0.1))
# Display plots individually
p1
p2
p3
p4
p5
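The five plots can also be combined into a single figure; a minimal sketch, assuming the gridExtra package is installed:
# Optional: arrange the comparison plots in one grid
library(gridExtra)
grid.arrange(p1, p2, p3, p4, ncol = 2)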