Data Loading & Basic EDA
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(corrplot)
library(ggcorrplot)
library(readr)
# Read the CSV file using semicolon as the delimiter
df <- read_delim("/Users/willberritt/Downloads/bank+marketing/bank/bank-full.csv", delim = ";")
# Check the first few rows
head(df)
## # A tibble: 6 × 17
## age job marital education default balance housing loan contact day
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepren… married secondary no 2 yes yes unknown 5
## 4 47 blue-coll… married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## # ℹ 7 more variables: month <chr>, duration <dbl>, campaign <dbl>, pdays <dbl>,
## # previous <dbl>, poutcome <chr>, y <chr>
unique(df$y)
## [1] "no" "yes"
# Check structure and summary
str(df)
## spc_tbl_ [45,211 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:45211] 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr [1:45211] "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr [1:45211] "married" "single" "married" "married" ...
## $ education: chr [1:45211] "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr [1:45211] "no" "no" "no" "no" ...
## $ balance : num [1:45211] 2143 29 2 1506 1 ...
## $ housing : chr [1:45211] "yes" "yes" "yes" "yes" ...
## $ loan : chr [1:45211] "no" "no" "yes" "no" ...
## $ contact : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
## $ day : num [1:45211] 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr [1:45211] "may" "may" "may" "may" ...
## $ duration : num [1:45211] 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : num [1:45211] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : num [1:45211] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : num [1:45211] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr [1:45211] "no" "no" "no" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. job = col_character(),
## .. marital = col_character(),
## .. education = col_character(),
## .. default = col_character(),
## .. balance = col_double(),
## .. housing = col_character(),
## .. loan = col_character(),
## .. contact = col_character(),
## .. day = col_double(),
## .. month = col_character(),
## .. duration = col_double(),
## .. campaign = col_double(),
## .. pdays = col_double(),
## .. previous = col_double(),
## .. poutcome = col_character(),
## .. y = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(df)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
# Check for values that equal 'unknown'
colSums(df == 'unknown')
## age job marital education default balance housing loan
## 0 288 0 1857 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 13020 0 0 0 0 0 0 36959
## y
## 0
# Check for values that equal 0
colSums(df == 0)
## age job marital education default balance housing loan
## 0 0 0 0 0 3514 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 3 0 0 36954 0
## y
## 0
# Count unique values in categorical variables
sapply(df, function(x) length(unique(x)))
## age job marital education default balance housing loan
## 77 12 3 4 2 7168 2 2
## contact day month duration campaign pdays previous poutcome
## 3 31 12 1573 48 559 41 4
## y
## 2
Data Exploration
# Compute correlation matrix for numeric features
numeric_vars <- df %>% select_if(is.numeric)
cor_matrix <- cor(numeric_vars, use = "complete.obs")
# Plot correlation heatmap
ggcorrplot(cor_matrix, method = "circle", lab = TRUE)
# Histogram for numerical features
num_cols <- colnames(numeric_vars)
par(mfrow=c(3,3)) # Arrange plots in a grid
for (col in num_cols) {
hist(df[[col]], main=paste("Histogram of", col), col="skyblue", breaks=30)
}
par(mfrow=c(1,1)) # Reset plot layout
# Boxplots for outlier detection
for (col in num_cols) {
boxplot(df[[col]], main=paste("Boxplot of", col), col="orange")
}
# Count plots for categorical variables
cat_vars <- df %>% select_if(is.character)
for (col in colnames(cat_vars)) {
print(ggplot(df, aes_string(x = col)) +
geom_bar(fill="steelblue") +
theme_minimal() +
ggtitle(paste("Distribution of", col)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Count of target variable
table(df$y)
##
## no yes
## 39922 5289
# Boxplot of numerical variables vs target variable
for (col in num_cols) {
print(ggplot(df, aes_string(x = "y", y = col)) +
geom_boxplot(fill="lightblue") +
theme_minimal() +
ggtitle(paste("Boxplot of", col, "by y")))
}
# Categorical variables vs target variable
for (col in colnames(cat_vars)) {
print(ggplot(df, aes_string(x = col, fill = "y")) +
geom_bar(position = "fill") +
theme_minimal() +
ggtitle(paste("Proportion of y by", col)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))
}
# Define color mapping for 'y'
colors <- ifelse(df$y == "yes", "blue", "red") # 'yes' -> blue, 'no' -> red
# Create pairwise scatter plots with color mapping
pairs(numeric_vars, col = colors, pch = 19)
Data Transformation & Considerations
# Data Cleaning - improve data quality, address missing data, etc.
# I don't think much is needed here. I acknowledge that there are some features with a substantial number of 'missing' data with records that read as 'unknown'. However, I think the records with 'unknown' may be valuable in increasing model accuracy as they differ from the sample mean distribution in terms of how many 'no's and 'yes's there are.
# Feature Engineering - use of business knowledge to create new features
# I've created three features: age_group, balance_category, loan_housing_interaction. Feature engineering is a bit of a guessing game. You can use business logic but sometimes data isn't backed by business logic. Often the business logic is informed by data and unique datasets can have unique patterns that aren't well-modeled by conventional business logic.
# 1. Age Grouping (Young, Middle-aged, Senior)
df$age_group <- cut(df$age,
breaks = c(18, 30, 50, 100),
labels = c("Young", "Middle-aged", "Senior"),
right = FALSE)
# 2. Balance Categorization (Low, Medium, High)
df$balance_category <- cut(df$balance,
breaks = c(-Inf, 0, 1000, 5000, Inf),
labels = c("Negative", "Low", "Medium", "High"))
# 3. Loan and Housing Interaction
df$loan_housing_interaction <- ifelse(df$housing == "no" & df$loan == "no", "No loans",
ifelse(df$housing == "yes" & df$loan == "no", "Only housing loan",
ifelse(df$housing == "no" & df$loan == "yes", "Only personal loan", "Both loans")))
# Check the first few rows to see the new features
head(df[, c("age_group", "balance_category", "loan_housing_interaction")])
## # A tibble: 6 × 3
## age_group balance_category loan_housing_interaction
## <fct> <fct> <chr>
## 1 Senior Medium Only housing loan
## 2 Middle-aged Low Only housing loan
## 3 Middle-aged Low Both loans
## 4 Middle-aged Medium Only housing loan
## 5 Middle-aged Low No loans
## 6 Middle-aged Low Only housing loan
# Sampling Data - using sampling to resize datasets
# Sampling is a bit tricky so I'd be tempted to build the models and evaluate them before I do any sampling. Removing records or adding records should by default not be the objective because in theory, there's info in these records but perhaps would be a necessary step depending on performance.
# Data Transformation - regularization, normalization, handling categorical variables
# 1. Z-score transformation for 'balance' column
df$balance_zscore <- (df$balance - mean(df$balance, na.rm = TRUE)) / sd(df$balance, na.rm = TRUE)
# 2. Z-score transformation for 'duration' column
df$duration_zscore <- (df$duration - mean(df$duration, na.rm = TRUE)) / sd(df$duration, na.rm = TRUE)
# Check the first few rows to see the transformed columns
head(df)
## # A tibble: 6 × 22
## age job marital education default balance housing loan contact day
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepren… married secondary no 2 yes yes unknown 5
## 4 47 blue-coll… married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## # ℹ 12 more variables: month <chr>, duration <dbl>, campaign <dbl>,
## # pdays <dbl>, previous <dbl>, poutcome <chr>, y <chr>, age_group <fct>,
## # balance_category <fct>, loan_housing_interaction <chr>,
## # balance_zscore <dbl>, duration_zscore <dbl>
# Look at distribution for newly formed z-score columns
numeric_vars <- df %>% select_if(is.numeric)
num_cols <- colnames(numeric_vars)
num_cols
## [1] "age" "balance" "day" "duration"
## [5] "campaign" "pdays" "previous" "balance_zscore"
## [9] "duration_zscore"
par(mfrow=c(3,3)) # Arrange plots in a grid
for (col in num_cols) {
hist(df[[col]], main=paste("Histogram of", col), col="skyblue", breaks=30)
}
# Imbalanced Data - reducing the imbalance between classes
# Think this may be needed but should test model performance without removing data first, ideally!
Modeling
# Load necessary libraries
library(rpart) # For decision tree models
library(randomForest) # For random forest models
library(caret) # For cross-validation and model tuning
library(xgboost) # For gradient boosting
library(dplyr) # Data manipulation
library(tidyr)
library(data.table)
library(pROC) # ROC and AUC metrics
# Ensure target variable 'y' is a factor for classification
df$y <- as.factor(df$y)
# -----------------------------------------------------
# PREPROCESSING & BASELINE DECISION TREE
# -----------------------------------------------------
# Convert relevant columns to factors for modeling
df_rpart <- df %>%
mutate(across(c(job, marital, education, default, housing, loan,
contact, month, poutcome, age_group,
balance_category, loan_housing_interaction), as.factor))
# Split into train/test sets (80/20 split)
trainIndex <- sample(1:nrow(df_rpart), 0.8 * nrow(df_rpart))
trainData <- df_rpart[trainIndex, ]
testData <- df_rpart[-trainIndex, ]
# Baseline Decision Tree with all available features
# Purpose: Establish a benchmark performance for tree-based models
dt_model <- rpart(y ~ ., data = trainData, method = "class")
predictions <- predict(dt_model, newdata = testData, type = "class")
conf_matrix <- table(Predicted = predictions, Actual = testData$y)
print(conf_matrix)
## Actual
## Predicted no yes
## no 7777 708
## yes 208 350
cat("Accuracy:", sum(diag(conf_matrix)) / sum(conf_matrix), "\n")
## Accuracy: 0.8987062
# -----------------------------------------------------
# DECISION TREE - Using Only Original Features
# -----------------------------------------------------
# Purpose: Compare how much engineered features (like z-scores and groupings)
# contribute to model performance
df_rpart_original <- df_rpart %>%
select(-age_group, -balance_category, -loan_housing_interaction, -balance_zscore, -duration_zscore)
# Repeat train/test split
trainIndex <- sample(1:nrow(df_rpart_original), 0.8 * nrow(df_rpart_original))
trainData <- df_rpart_original[trainIndex, ]
testData <- df_rpart_original[-trainIndex, ]
# Decision tree with only original variables (no feature engineering)
dt_model_original <- rpart(y ~ ., data = trainData, method = "class")
predictions <- predict(dt_model_original, newdata = testData, type = "class")
conf_matrix <- table(Predicted = predictions, Actual = testData$y)
print(conf_matrix)
## Actual
## Predicted no yes
## no 7775 697
## yes 203 368
cat("Accuracy:", sum(diag(conf_matrix)) / sum(conf_matrix), "\n")
## Accuracy: 0.9004755
# -----------------------------------------------------
# DECISION TREE - Pruned with max depth
# -----------------------------------------------------
# Purpose: Investigate effect of model complexity control via max depth
dt_model1 <- rpart(y ~ ., data = trainData, control = rpart.control(maxdepth = 10))
dt_pred1 <- predict(dt_model1, testData, type = "class")
conf_matrix1 <- table(Predicted = dt_pred1, Actual = testData$y)
print(conf_matrix1)
## Actual
## Predicted no yes
## no 7775 697
## yes 203 368
cat("Decision Tree (max_depth=10) Accuracy:", round(sum(dt_pred1 == testData$y) / nrow(testData), 4), "\n")
## Decision Tree (max_depth=10) Accuracy: 0.9005
# -----------------------------------------------------
# RANDOM FOREST - Baseline
# -----------------------------------------------------
# Purpose: Improve accuracy by using ensemble of trees (bagging)
rf_model <- randomForest(y ~ ., data = trainData)
rf_predictions <- predict(rf_model, newdata = testData)
confusion_matrix <- confusionMatrix(rf_predictions, testData$y)
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7672 539
## yes 306 526
##
## Accuracy : 0.9066
## 95% CI : (0.9004, 0.9125)
## No Information Rate : 0.8822
## P-Value [Acc > NIR] : 7.260e-14
##
## Kappa : 0.5032
##
## Mcnemar's Test P-Value : 1.451e-15
##
## Sensitivity : 0.9616
## Specificity : 0.4939
## Pos Pred Value : 0.9344
## Neg Pred Value : 0.6322
## Prevalence : 0.8822
## Detection Rate : 0.8484
## Detection Prevalence : 0.9080
## Balanced Accuracy : 0.7278
##
## 'Positive' Class : no
##
cat("Random Forest Accuracy:", confusion_matrix$overall['Accuracy'], "\n")
## Random Forest Accuracy: 0.9065576
# -----------------------------------------------------
# OUTLIER REMOVAL (IQR) FOR NUMERICAL FIELDS
# -----------------------------------------------------
# Purpose: Evaluate whether removing outliers improves model robustness
remove_outliers <- function(df) {
for (col in colnames(df)) {
Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
IQR_value <- IQR(df[[col]], na.rm = TRUE)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
df <- df %>% filter(df[[col]] >= lower_bound & df[[col]] <= upper_bound)
}
return(df)
}
numerical_columns <- sapply(df_rpart, is.numeric)
df_numeric <- df_rpart[, numerical_columns]
df_no_outliers <- remove_outliers(df_numeric)
df_rpart_no_outliers <- df_rpart[rownames(df_no_outliers), ]
# -----------------------------------------------------
# RANDOM FOREST - After Removing Outliers
# -----------------------------------------------------
# Purpose: Compare performance of Random Forest on cleaner data
trainIndex <- sample(1:nrow(df_rpart_no_outliers), 0.8 * nrow(df_rpart_no_outliers))
trainData <- df_rpart_no_outliers[trainIndex, ]
testData <- df_rpart_no_outliers[-trainIndex, ]
rf_model <- randomForest(y ~ ., data = trainData)
rf_predictions <- predict(rf_model, newdata = testData)
confusion_matrix <- confusionMatrix(rf_predictions, testData$y)
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4740 139
## yes 74 110
##
## Accuracy : 0.9579
## 95% CI : (0.952, 0.9633)
## No Information Rate : 0.9508
## P-Value [Acc > NIR] : 0.009324
##
## Kappa : 0.4866
##
## Mcnemar's Test P-Value : 1.159e-05
##
## Sensitivity : 0.9846
## Specificity : 0.4418
## Pos Pred Value : 0.9715
## Neg Pred Value : 0.5978
## Prevalence : 0.9508
## Detection Rate : 0.9362
## Detection Prevalence : 0.9637
## Balanced Accuracy : 0.7132
##
## 'Positive' Class : no
##
cat("Random Forest (no outliers) Accuracy:", confusion_matrix$overall['Accuracy'], "\n")
## Random Forest (no outliers) Accuracy: 0.9579301
# -----------------------------------------------------
# RANDOM FOREST - More Trees (ntree = 5000)
# -----------------------------------------------------
# Purpose: Test if increasing number of trees improves stability/performance
rf_model <- randomForest(y ~ ., data = trainData, ntree = 5000)
rf_predictions <- predict(rf_model, newdata = testData)
confusion_matrix <- confusionMatrix(rf_predictions, testData$y)
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4738 142
## yes 76 107
##
## Accuracy : 0.9569
## 95% CI : (0.951, 0.9624)
## No Information Rate : 0.9508
## P-Value [Acc > NIR] : 0.02207
##
## Kappa : 0.4734
##
## Mcnemar's Test P-Value : 1.071e-05
##
## Sensitivity : 0.9842
## Specificity : 0.4297
## Pos Pred Value : 0.9709
## Neg Pred Value : 0.5847
## Prevalence : 0.9508
## Detection Rate : 0.9358
## Detection Prevalence : 0.9639
## Balanced Accuracy : 0.7070
##
## 'Positive' Class : no
##
cat("Random Forest (ntree=5000) Accuracy:", confusion_matrix$overall['Accuracy'], "\n")
## Random Forest (ntree=5000) Accuracy: 0.9569425
# -----------------------------------------------------
# XGBOOST - BASIC MODEL
# -----------------------------------------------------
# Purpose: Try boosting method, which often outperforms bagging (RF)
df <- df_rpart_no_outliers %>%
mutate(across(c(job, marital, education, default, housing, loan,
contact, day, month, poutcome, y), as.factor))
df_encoded <- df %>%
select(-y) %>%
mutate_if(is.factor, as.character) %>%
model.matrix(~ . - 1, data = .) %>%
data.frame()
y <- as.numeric(df$y) - 1
X <- as.matrix(df_encoded)
dtrain <- xgb.DMatrix(data = X, label = y)
params <- list(
objective = "binary:logistic",
eval_metric = "error",
eta = 0.1,
max_depth = 6
)
# Train XGBoost with basic parameters
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = 100)
preds <- predict(xgb_model, X)
pred_labels <- ifelse(preds > 0.5, 1, 0)
accuracy <- mean(pred_labels == y)
cat("XGBoost (basic) Accuracy:", accuracy, "\n")
## XGBoost (basic) Accuracy: 0.9730199
conf_matrix <- confusionMatrix(as.factor(pred_labels), as.factor(y))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 23919 531
## 1 152 713
##
## Accuracy : 0.973
## 95% CI : (0.9709, 0.975)
## No Information Rate : 0.9509
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6625
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9937
## Specificity : 0.5732
## Pos Pred Value : 0.9783
## Neg Pred Value : 0.8243
## Prevalence : 0.9509
## Detection Rate : 0.9449
## Detection Prevalence : 0.9658
## Balanced Accuracy : 0.7834
##
## 'Positive' Class : 0
##
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("XGBoost Precision:", precision, "\n")
## XGBoost Precision: 0.9782822
cat("XGBoost Recall:", recall, "\n")
## XGBoost Recall: 0.9936853
cat("XGBoost F1-Score:", f1_score, "\n")
## XGBoost F1-Score: 0.9859236
# -----------------------------------------------------
# XGBOOST - CROSS VALIDATION WITH CARET
# -----------------------------------------------------
params <- list(
objective = "binary:logistic", # Binary classification
eval_metric = "logloss", # Evaluation metric for cross-validation
eta = 0.1, # Learning rate
max_depth = 6, # Maximum depth of trees
subsample = 0.8, # Subsample ratio of the training instance
colsample_bytree = 0.8 # Subsample ratio of columns when constructing each tree
)
# Perform cross-validation
cv_results <- xgb.cv(
params = params,
data = dtrain,
nrounds = 100, # Number of boosting rounds
nfold = 5, # Number of folds for cross-validation
showsd = TRUE, # Display the standard deviation of each metric
stratified = TRUE, # Stratified sampling to preserve class distribution
print_every_n = 10, # Print results every 10 rounds
early_stopping_rounds = 10, # Stop early if there is no improvement
verbose = 1 # Display progress
)
## [1] train-logloss:0.608837+0.000229 test-logloss:0.609610+0.000478
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
##
## [11] train-logloss:0.234481+0.000441 test-logloss:0.241286+0.002251
## [21] train-logloss:0.132791+0.001382 test-logloss:0.144123+0.003650
## [31] train-logloss:0.096338+0.000781 test-logloss:0.111732+0.003524
## [41] train-logloss:0.082204+0.000544 test-logloss:0.100913+0.003555
## [51] train-logloss:0.075684+0.000768 test-logloss:0.096968+0.003582
## [61] train-logloss:0.072295+0.000892 test-logloss:0.096070+0.003748
## Stopping. Best iteration:
## [59] train-logloss:0.072854+0.000883 test-logloss:0.096028+0.003678
# Extract the best number of boosting rounds from the cross-validation results
best_nrounds <- cv_results$best_iteration
cat("Best number of rounds: ", best_nrounds, "\n")
## Best number of rounds: 59
# Use the best number of rounds to train the final model
final_model <- xgb.train(
params = params,
data = dtrain,
nrounds = best_nrounds
)
# Make predictions on the training data
preds <- predict(final_model, X)
# Convert probabilities to binary outcomes (0 or 1)
pred_labels <- ifelse(preds > 0.5, 1, 0)
# Calculate accuracy
accuracy <- mean(pred_labels == y)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.9692672
# Calculate precision, recall, F1-score, confusion matrix, and ROC-AUC
conf_matrix <- confusionMatrix(as.factor(pred_labels), as.factor(y))
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 23881 588
## 1 190 656
##
## Accuracy : 0.9693
## 95% CI : (0.9671, 0.9714)
## No Information Rate : 0.9509
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6123
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9921
## Specificity : 0.5273
## Pos Pred Value : 0.9760
## Neg Pred Value : 0.7754
## Prevalence : 0.9509
## Detection Rate : 0.9434
## Detection Prevalence : 0.9666
## Balanced Accuracy : 0.7597
##
## 'Positive' Class : 0
##
# Precision, Recall, F1-Score
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Precision: ", precision, "\n")
## Precision: 0.9759696
cat("Recall: ", recall, "\n")
## Recall: 0.9921067
cat("F1-Score: ", f1_score, "\n")
## F1-Score: 0.983972
# ROC-AUC
roc_curve <- roc(y, preds)
auc_value <- auc(roc_curve)
cat("ROC-AUC: ", auc_value, "\n")
## ROC-AUC: 0.981444
# Load necessary libraries
library(e1071)
library(caret)
library(pROC)
# Define class weights to handle imbalance
class_weights <- c("no" = 1, "yes" = 19)
# Function to evaluate model
evaluate_model <- function(model, testData, model_name = "SVM") {
preds <- predict(model, testData, probability = TRUE)
probs <- attr(preds, "probabilities")[, "yes"]
cm <- confusionMatrix(preds, testData$y)
roc_obj <- roc(response = testData$y, predictor = probs, levels = c("no", "yes"))
auc_val <- auc(roc_obj)
f1 <- 2 * (cm$byClass["Pos Pred Value"] * cm$byClass["Sensitivity"]) /
(cm$byClass["Pos Pred Value"] + cm$byClass["Sensitivity"])
cat("\n---", model_name, "---\n")
cat("Accuracy:", cm$overall["Accuracy"], "\n")
cat("Precision:", cm$byClass["Pos Pred Value"], "\n")
cat("Recall:", cm$byClass["Sensitivity"], "\n")
cat("F1-Score:", f1, "\n")
cat("ROC-AUC:", auc_val, "\n")
plot(roc_obj, main = paste(model_name, "- ROC Curve"), col = "blue")
}
# Linear Kernel SVM
svm_linear <- svm(
y ~ .,
data = trainData,
type = 'C-classification',
kernel = 'linear',
class.weights = class_weights,
probability = TRUE
)
evaluate_model(svm_linear, testData, "Linear SVM")
##
## --- Linear SVM ---
## Accuracy: 0.9561525
## Precision: 0.965154
## Recall: 0.9896136
## F1-Score: 0.9772308
## ROC-AUC: 0.9639122
# RBF Kernel SVM
svm_rbf <- svm(
y ~ .,
data = trainData,
type = 'C-classification',
kernel = 'radial',
class.weights = class_weights,
probability = TRUE
)
evaluate_model(svm_rbf, testData, "RBF SVM")
##
## --- RBF SVM ---
## Accuracy: 0.9514122
## Precision: 0.9621611
## Recall: 0.9877441
## F1-Score: 0.9747847
## ROC-AUC: 0.9543942
# Polynomial Kernel SVM
svm_poly <- svm(
y ~ .,
data = trainData,
type = 'C-classification',
kernel = 'polynomial',
degree = 3,
class.weights = class_weights,
probability = TRUE
)
evaluate_model(svm_poly, testData, "Polynomial SVM")
##
## --- Polynomial SVM ---
## Accuracy: 0.95635
## Precision: 0.9655382
## Recall: 0.9894059
## F1-Score: 0.9773264
## ROC-AUC: 0.9241607
# Radial kernel SVM with fixed hyperparameters
svm_rbf_model <- svm(
y ~ .,
data = trainData,
type = 'C-classification',
kernel = 'radial',
cost = 1,
gamma = 0.1,
class.weights = class_weights,
probability = TRUE
)
evaluate_model(svm_rbf_model, testData, "Polynomial SVM")
##
## --- Polynomial SVM ---
## Accuracy: 0.9484495
## Precision: 0.9607367
## Recall: 0.9860823
## F1-Score: 0.9732445
## ROC-AUC: 0.9465623
# Polynomial kernel SVM with extreme hyperparameters (opposite spectrum)
svm_poly_extreme_model <- svm(
y ~ .,
data = trainData,
type = 'C-classification',
kernel = 'polynomial',
degree = 5, # Higher degree = more complex boundary
cost = 1000, # High cost = low tolerance for misclassification
coef0 = 10, # Affects influence of higher-order terms
gamma = 10, # High gamma = very sensitive to training points
class.weights = NULL, # No weighting
probability = TRUE
)
evaluate_model(svm_poly_extreme_model, testData, "Extreme Polynomial SVM")
##
## --- Extreme Polynomial SVM ---
## Accuracy: 0.9512147
## Precision: 0.9511954
## Recall: 1
## F1-Score: 0.9749873
## ROC-AUC: 0.7774179
# Polynomial kernel SVM with moderate hyperparameters (middle ground)
svm_poly_moderate_model <- svm(
y ~ .,
data = trainData,
type = 'C-classification',
kernel = 'polynomial',
degree = 3, # Cubic decision boundary, but not overly complex
cost = 10, # Balanced penalty for misclassification
coef0 = 1, # Default/neutral influence of higher-order terms
gamma = 0.5, # Moderate sensitivity to data
class.weights = class_weights,
probability = TRUE
)
evaluate_model(svm_poly_moderate_model, testData, "Moderate Polynomial SVM")
##
## --- Moderate Polynomial SVM ---
## Accuracy: 0.9490421
## Precision: 0.9528827
## Recall: 0.9956377
## F1-Score: 0.9737911
## ROC-AUC: 0.8136418