Data Loading & Basic EDA

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(corrplot)
library(ggcorrplot)
library(readr)

# Read the CSV file using semicolon as the delimiter
df <- read_delim("/Users/willberritt/Downloads/bank+marketing/bank/bank-full.csv", delim = ";")

# Check the first few rows
head(df)
## # A tibble: 6 × 17
##     age job        marital education default balance housing loan  contact   day
##   <dbl> <chr>      <chr>   <chr>     <chr>     <dbl> <chr>   <chr> <chr>   <dbl>
## 1    58 management married tertiary  no         2143 yes     no    unknown     5
## 2    44 technician single  secondary no           29 yes     no    unknown     5
## 3    33 entrepren… married secondary no            2 yes     yes   unknown     5
## 4    47 blue-coll… married unknown   no         1506 yes     no    unknown     5
## 5    33 unknown    single  unknown   no            1 no      no    unknown     5
## 6    35 management married tertiary  no          231 yes     no    unknown     5
## # ℹ 7 more variables: month <chr>, duration <dbl>, campaign <dbl>, pdays <dbl>,
## #   previous <dbl>, poutcome <chr>, y <chr>
unique(df$y)
## [1] "no"  "yes"
# Check structure and summary
str(df)
## spc_tbl_ [45,211 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age      : num [1:45211] 58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr [1:45211] "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr [1:45211] "married" "single" "married" "married" ...
##  $ education: chr [1:45211] "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr [1:45211] "no" "no" "no" "no" ...
##  $ balance  : num [1:45211] 2143 29 2 1506 1 ...
##  $ housing  : chr [1:45211] "yes" "yes" "yes" "yes" ...
##  $ loan     : chr [1:45211] "no" "no" "yes" "no" ...
##  $ contact  : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : num [1:45211] 5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr [1:45211] "may" "may" "may" "may" ...
##  $ duration : num [1:45211] 261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : num [1:45211] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : num [1:45211] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : num [1:45211] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr [1:45211] "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr [1:45211] "no" "no" "no" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   job = col_character(),
##   ..   marital = col_character(),
##   ..   education = col_character(),
##   ..   default = col_character(),
##   ..   balance = col_double(),
##   ..   housing = col_character(),
##   ..   loan = col_character(),
##   ..   contact = col_character(),
##   ..   day = col_double(),
##   ..   month = col_character(),
##   ..   duration = col_double(),
##   ..   campaign = col_double(),
##   ..   pdays = col_double(),
##   ..   previous = col_double(),
##   ..   poutcome = col_character(),
##   ..   y = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(df)
##       age            job              marital           education        
##  Min.   :18.00   Length:45211       Length:45211       Length:45211      
##  1st Qu.:33.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.94                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :95.00                                                           
##    default             balance         housing              loan          
##  Length:45211       Min.   : -8019   Length:45211       Length:45211      
##  Class :character   1st Qu.:    72   Class :character   Class :character  
##  Mode  :character   Median :   448   Mode  :character   Mode  :character  
##                     Mean   :  1362                                        
##                     3rd Qu.:  1428                                        
##                     Max.   :102127                                        
##    contact               day           month              duration     
##  Length:45211       Min.   : 1.00   Length:45211       Min.   :   0.0  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 103.0  
##  Mode  :character   Median :16.00   Mode  :character   Median : 180.0  
##                     Mean   :15.81                      Mean   : 258.2  
##                     3rd Qu.:21.00                      3rd Qu.: 319.0  
##                     Max.   :31.00                      Max.   :4918.0  
##     campaign          pdays          previous          poutcome        
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000   Length:45211      
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000   Class :character  
##  Median : 2.000   Median : -1.0   Median :  0.0000   Mode  :character  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803                     
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000                     
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000                     
##       y            
##  Length:45211      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Check for values that equal 'unknown'
colSums(df == 'unknown')
##       age       job   marital education   default   balance   housing      loan 
##         0       288         0      1857         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##     13020         0         0         0         0         0         0     36959 
##         y 
##         0
# Check for values that equal 0
colSums(df == 0)
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0      3514         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         3         0         0     36954         0 
##         y 
##         0
# Count unique values in categorical variables
sapply(df, function(x) length(unique(x)))
##       age       job   marital education   default   balance   housing      loan 
##        77        12         3         4         2      7168         2         2 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         3        31        12      1573        48       559        41         4 
##         y 
##         2

Data Exploration

# Compute correlation matrix for numeric features
numeric_vars <- df %>% select_if(is.numeric)
cor_matrix <- cor(numeric_vars, use = "complete.obs")

# Plot correlation heatmap
ggcorrplot(cor_matrix, method = "circle", lab = TRUE)

# Histogram for numerical features
num_cols <- colnames(numeric_vars)
par(mfrow=c(3,3))  # Arrange plots in a grid
for (col in num_cols) {
  hist(df[[col]], main=paste("Histogram of", col), col="skyblue", breaks=30)
}
par(mfrow=c(1,1))  # Reset plot layout

# Boxplots for outlier detection
for (col in num_cols) {
  boxplot(df[[col]], main=paste("Boxplot of", col), col="orange")
}

# Count plots for categorical variables
cat_vars <- df %>% select_if(is.character)

for (col in colnames(cat_vars)) {
  print(ggplot(df, aes_string(x = col)) + 
          geom_bar(fill="steelblue") + 
          theme_minimal() + 
          ggtitle(paste("Distribution of", col)) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1)))
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Count of target variable
table(df$y)
## 
##    no   yes 
## 39922  5289
# Boxplot of numerical variables vs target variable
for (col in num_cols) {
  print(ggplot(df, aes_string(x = "y", y = col)) + 
          geom_boxplot(fill="lightblue") + 
          theme_minimal() + 
          ggtitle(paste("Boxplot of", col, "by y")))
}

# Categorical variables vs target variable
for (col in colnames(cat_vars)) {
  print(ggplot(df, aes_string(x = col, fill = "y")) + 
          geom_bar(position = "fill") + 
          theme_minimal() + 
          ggtitle(paste("Proportion of y by", col)) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1)))
}

# Define color mapping for 'y'
colors <- ifelse(df$y == "yes", "blue", "red")  # 'yes' -> blue, 'no' -> red

# Create pairwise scatter plots with color mapping
pairs(numeric_vars, col = colors, pch = 19)

Data Transformation & Considerations

# Data Cleaning - improve data quality, address missing data, etc.
  # I don't think much is needed here. I acknowledge that there are some features with a substantial number of 'missing' data with records that read as 'unknown'. However, I think the records with 'unknown' may be valuable in increasing model accuracy as they differ from the sample mean distribution in terms of how many 'no's and 'yes's there are.
# Feature Engineering - use of business knowledge to create new features
  # I've created three features: age_group, balance_category, loan_housing_interaction. Feature engineering is a bit of a guessing game. You can use business logic but sometimes data isn't backed by business logic. Often the business logic is informed by data and unique datasets can have unique patterns that aren't well-modeled by conventional business logic.
# 1. Age Grouping (Young, Middle-aged, Senior)
df$age_group <- cut(df$age, 
                    breaks = c(18, 30, 50, 100), 
                    labels = c("Young", "Middle-aged", "Senior"), 
                    right = FALSE)

# 2. Balance Categorization (Low, Medium, High)
df$balance_category <- cut(df$balance, 
                           breaks = c(-Inf, 0, 1000, 5000, Inf), 
                           labels = c("Negative", "Low", "Medium", "High"))

# 3. Loan and Housing Interaction
df$loan_housing_interaction <- ifelse(df$housing == "no" & df$loan == "no", "No loans",
                                      ifelse(df$housing == "yes" & df$loan == "no", "Only housing loan",
                                             ifelse(df$housing == "no" & df$loan == "yes", "Only personal loan", "Both loans")))

# Check the first few rows to see the new features
head(df[, c("age_group", "balance_category", "loan_housing_interaction")])
## # A tibble: 6 × 3
##   age_group   balance_category loan_housing_interaction
##   <fct>       <fct>            <chr>                   
## 1 Senior      Medium           Only housing loan       
## 2 Middle-aged Low              Only housing loan       
## 3 Middle-aged Low              Both loans              
## 4 Middle-aged Medium           Only housing loan       
## 5 Middle-aged Low              No loans                
## 6 Middle-aged Low              Only housing loan
# Sampling Data - using sampling to resize datasets
  # Sampling is a bit tricky so I'd be tempted to build the models and evaluate them before I do any sampling. Removing records or adding records should by default not be the objective because in theory, there's info in these records but perhaps would be a necessary step depending on performance. 
# Data Transformation - regularization, normalization, handling categorical variables
# 1. Z-score transformation for 'balance' column
df$balance_zscore <- (df$balance - mean(df$balance, na.rm = TRUE)) / sd(df$balance, na.rm = TRUE)

# 2. Z-score transformation for 'duration' column
df$duration_zscore <- (df$duration - mean(df$duration, na.rm = TRUE)) / sd(df$duration, na.rm = TRUE)

# Check the first few rows to see the transformed columns
head(df)
## # A tibble: 6 × 22
##     age job        marital education default balance housing loan  contact   day
##   <dbl> <chr>      <chr>   <chr>     <chr>     <dbl> <chr>   <chr> <chr>   <dbl>
## 1    58 management married tertiary  no         2143 yes     no    unknown     5
## 2    44 technician single  secondary no           29 yes     no    unknown     5
## 3    33 entrepren… married secondary no            2 yes     yes   unknown     5
## 4    47 blue-coll… married unknown   no         1506 yes     no    unknown     5
## 5    33 unknown    single  unknown   no            1 no      no    unknown     5
## 6    35 management married tertiary  no          231 yes     no    unknown     5
## # ℹ 12 more variables: month <chr>, duration <dbl>, campaign <dbl>,
## #   pdays <dbl>, previous <dbl>, poutcome <chr>, y <chr>, age_group <fct>,
## #   balance_category <fct>, loan_housing_interaction <chr>,
## #   balance_zscore <dbl>, duration_zscore <dbl>
# Look at distribution for newly formed z-score columns
numeric_vars <- df %>% select_if(is.numeric)
num_cols <- colnames(numeric_vars)
num_cols
## [1] "age"             "balance"         "day"             "duration"       
## [5] "campaign"        "pdays"           "previous"        "balance_zscore" 
## [9] "duration_zscore"
par(mfrow=c(3,3))  # Arrange plots in a grid
for (col in num_cols) {
  hist(df[[col]], main=paste("Histogram of", col), col="skyblue", breaks=30)
}

# Imbalanced Data - reducing the imbalance between classes
  # Think this may be needed but should test model performance without removing data first, ideally!

Modeling

# Load necessary libraries
library(rpart)        # For decision tree models
library(randomForest) # For random forest models
library(caret)        # For cross-validation and model tuning
library(xgboost)      # For gradient boosting
library(dplyr)        # Data manipulation
library(tidyr)
library(data.table)
library(pROC)         # ROC and AUC metrics

# Ensure target variable 'y' is a factor for classification
df$y <- as.factor(df$y)

# -----------------------------------------------------
# PREPROCESSING & BASELINE DECISION TREE
# -----------------------------------------------------

# Convert relevant columns to factors for modeling
df_rpart <- df %>%
  mutate(across(c(job, marital, education, default, housing, loan,
                  contact, month, poutcome, age_group,
                  balance_category, loan_housing_interaction), as.factor))

# Split into train/test sets (80/20 split)
trainIndex <- sample(1:nrow(df_rpart), 0.8 * nrow(df_rpart))
trainData <- df_rpart[trainIndex, ]
testData <- df_rpart[-trainIndex, ]

# Baseline Decision Tree with all available features
# Purpose: Establish a benchmark performance for tree-based models
dt_model <- rpart(y ~ ., data = trainData, method = "class")
predictions <- predict(dt_model, newdata = testData, type = "class")
conf_matrix <- table(Predicted = predictions, Actual = testData$y)
print(conf_matrix)
##          Actual
## Predicted   no  yes
##       no  7777  708
##       yes  208  350
cat("Accuracy:", sum(diag(conf_matrix)) / sum(conf_matrix), "\n")
## Accuracy: 0.8987062
# -----------------------------------------------------
# DECISION TREE - Using Only Original Features
# -----------------------------------------------------

# Purpose: Compare how much engineered features (like z-scores and groupings)
# contribute to model performance
df_rpart_original <- df_rpart %>%
  select(-age_group, -balance_category, -loan_housing_interaction, -balance_zscore, -duration_zscore)

# Repeat train/test split
trainIndex <- sample(1:nrow(df_rpart_original), 0.8 * nrow(df_rpart_original))
trainData <- df_rpart_original[trainIndex, ]
testData <- df_rpart_original[-trainIndex, ]

# Decision tree with only original variables (no feature engineering)
dt_model_original <- rpart(y ~ ., data = trainData, method = "class")
predictions <- predict(dt_model_original, newdata = testData, type = "class")
conf_matrix <- table(Predicted = predictions, Actual = testData$y)
print(conf_matrix)
##          Actual
## Predicted   no  yes
##       no  7775  697
##       yes  203  368
cat("Accuracy:", sum(diag(conf_matrix)) / sum(conf_matrix), "\n")
## Accuracy: 0.9004755
# -----------------------------------------------------
# DECISION TREE - Pruned with max depth
# -----------------------------------------------------

# Purpose: Investigate effect of model complexity control via max depth
dt_model1 <- rpart(y ~ ., data = trainData, control = rpart.control(maxdepth = 10))
dt_pred1 <- predict(dt_model1, testData, type = "class")
conf_matrix1 <- table(Predicted = dt_pred1, Actual = testData$y)
print(conf_matrix1)
##          Actual
## Predicted   no  yes
##       no  7775  697
##       yes  203  368
cat("Decision Tree (max_depth=10) Accuracy:", round(sum(dt_pred1 == testData$y) / nrow(testData), 4), "\n")
## Decision Tree (max_depth=10) Accuracy: 0.9005
# -----------------------------------------------------
# RANDOM FOREST - Baseline
# -----------------------------------------------------

# Purpose: Improve accuracy by using ensemble of trees (bagging)
rf_model <- randomForest(y ~ ., data = trainData)
rf_predictions <- predict(rf_model, newdata = testData)
confusion_matrix <- confusionMatrix(rf_predictions, testData$y)
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7672  539
##        yes  306  526
##                                           
##                Accuracy : 0.9066          
##                  95% CI : (0.9004, 0.9125)
##     No Information Rate : 0.8822          
##     P-Value [Acc > NIR] : 7.260e-14       
##                                           
##                   Kappa : 0.5032          
##                                           
##  Mcnemar's Test P-Value : 1.451e-15       
##                                           
##             Sensitivity : 0.9616          
##             Specificity : 0.4939          
##          Pos Pred Value : 0.9344          
##          Neg Pred Value : 0.6322          
##              Prevalence : 0.8822          
##          Detection Rate : 0.8484          
##    Detection Prevalence : 0.9080          
##       Balanced Accuracy : 0.7278          
##                                           
##        'Positive' Class : no              
## 
cat("Random Forest Accuracy:", confusion_matrix$overall['Accuracy'], "\n")
## Random Forest Accuracy: 0.9065576
# -----------------------------------------------------
# OUTLIER REMOVAL (IQR) FOR NUMERICAL FIELDS
# -----------------------------------------------------

# Purpose: Evaluate whether removing outliers improves model robustness
remove_outliers <- function(df) {
  for (col in colnames(df)) {
    Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
    Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
    IQR_value <- IQR(df[[col]], na.rm = TRUE)
    lower_bound <- Q1 - 1.5 * IQR_value
    upper_bound <- Q3 + 1.5 * IQR_value
    df <- df %>% filter(df[[col]] >= lower_bound & df[[col]] <= upper_bound)
  }
  return(df)
}

numerical_columns <- sapply(df_rpart, is.numeric)
df_numeric <- df_rpart[, numerical_columns]
df_no_outliers <- remove_outliers(df_numeric)
df_rpart_no_outliers <- df_rpart[rownames(df_no_outliers), ]

# -----------------------------------------------------
# RANDOM FOREST - After Removing Outliers
# -----------------------------------------------------

# Purpose: Compare performance of Random Forest on cleaner data
trainIndex <- sample(1:nrow(df_rpart_no_outliers), 0.8 * nrow(df_rpart_no_outliers))
trainData <- df_rpart_no_outliers[trainIndex, ]
testData <- df_rpart_no_outliers[-trainIndex, ]

rf_model <- randomForest(y ~ ., data = trainData)
rf_predictions <- predict(rf_model, newdata = testData)
confusion_matrix <- confusionMatrix(rf_predictions, testData$y)
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4740  139
##        yes   74  110
##                                          
##                Accuracy : 0.9579         
##                  95% CI : (0.952, 0.9633)
##     No Information Rate : 0.9508         
##     P-Value [Acc > NIR] : 0.009324       
##                                          
##                   Kappa : 0.4866         
##                                          
##  Mcnemar's Test P-Value : 1.159e-05      
##                                          
##             Sensitivity : 0.9846         
##             Specificity : 0.4418         
##          Pos Pred Value : 0.9715         
##          Neg Pred Value : 0.5978         
##              Prevalence : 0.9508         
##          Detection Rate : 0.9362         
##    Detection Prevalence : 0.9637         
##       Balanced Accuracy : 0.7132         
##                                          
##        'Positive' Class : no             
## 
cat("Random Forest (no outliers) Accuracy:", confusion_matrix$overall['Accuracy'], "\n")
## Random Forest (no outliers) Accuracy: 0.9579301
# -----------------------------------------------------
# RANDOM FOREST - More Trees (ntree = 5000)
# -----------------------------------------------------

# Purpose: Test if increasing number of trees improves stability/performance
rf_model <- randomForest(y ~ ., data = trainData, ntree = 5000)
rf_predictions <- predict(rf_model, newdata = testData)
confusion_matrix <- confusionMatrix(rf_predictions, testData$y)
print(confusion_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4738  142
##        yes   76  107
##                                          
##                Accuracy : 0.9569         
##                  95% CI : (0.951, 0.9624)
##     No Information Rate : 0.9508         
##     P-Value [Acc > NIR] : 0.02207        
##                                          
##                   Kappa : 0.4734         
##                                          
##  Mcnemar's Test P-Value : 1.071e-05      
##                                          
##             Sensitivity : 0.9842         
##             Specificity : 0.4297         
##          Pos Pred Value : 0.9709         
##          Neg Pred Value : 0.5847         
##              Prevalence : 0.9508         
##          Detection Rate : 0.9358         
##    Detection Prevalence : 0.9639         
##       Balanced Accuracy : 0.7070         
##                                          
##        'Positive' Class : no             
## 
cat("Random Forest (ntree=5000) Accuracy:", confusion_matrix$overall['Accuracy'], "\n")
## Random Forest (ntree=5000) Accuracy: 0.9569425
# -----------------------------------------------------
# XGBOOST - BASIC MODEL
# -----------------------------------------------------

# Purpose: Try boosting method, which often outperforms bagging (RF)
df <- df_rpart_no_outliers %>%
  mutate(across(c(job, marital, education, default, housing, loan,
                  contact, day, month, poutcome, y), as.factor))

df_encoded <- df %>%
  select(-y) %>%
  mutate_if(is.factor, as.character) %>%
  model.matrix(~ . - 1, data = .) %>%
  data.frame()

y <- as.numeric(df$y) - 1
X <- as.matrix(df_encoded)
dtrain <- xgb.DMatrix(data = X, label = y)

params <- list(
  objective = "binary:logistic",
  eval_metric = "error",
  eta = 0.1,
  max_depth = 6
)

# Train XGBoost with basic parameters
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = 100)
preds <- predict(xgb_model, X)
pred_labels <- ifelse(preds > 0.5, 1, 0)

accuracy <- mean(pred_labels == y)
cat("XGBoost (basic) Accuracy:", accuracy, "\n")
## XGBoost (basic) Accuracy: 0.9730199
conf_matrix <- confusionMatrix(as.factor(pred_labels), as.factor(y))
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 23919   531
##          1   152   713
##                                          
##                Accuracy : 0.973          
##                  95% CI : (0.9709, 0.975)
##     No Information Rate : 0.9509         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6625         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9937         
##             Specificity : 0.5732         
##          Pos Pred Value : 0.9783         
##          Neg Pred Value : 0.8243         
##              Prevalence : 0.9509         
##          Detection Rate : 0.9449         
##    Detection Prevalence : 0.9658         
##       Balanced Accuracy : 0.7834         
##                                          
##        'Positive' Class : 0              
## 
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("XGBoost Precision:", precision, "\n")
## XGBoost Precision: 0.9782822
cat("XGBoost Recall:", recall, "\n")
## XGBoost Recall: 0.9936853
cat("XGBoost F1-Score:", f1_score, "\n")
## XGBoost F1-Score: 0.9859236
# -----------------------------------------------------
# XGBOOST - CROSS VALIDATION WITH CARET
# -----------------------------------------------------

params <- list(
  objective = "binary:logistic",  # Binary classification
  eval_metric = "logloss",        # Evaluation metric for cross-validation
  eta = 0.1,                     # Learning rate
  max_depth = 6,                  # Maximum depth of trees
  subsample = 0.8,                # Subsample ratio of the training instance
  colsample_bytree = 0.8         # Subsample ratio of columns when constructing each tree
)

# Perform cross-validation
cv_results <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 100,                  # Number of boosting rounds
  nfold = 5,                      # Number of folds for cross-validation
  showsd = TRUE,                  # Display the standard deviation of each metric
  stratified = TRUE,              # Stratified sampling to preserve class distribution
  print_every_n = 10,             # Print results every 10 rounds
  early_stopping_rounds = 10,     # Stop early if there is no improvement
  verbose = 1                     # Display progress
)
## [1]  train-logloss:0.608837+0.000229 test-logloss:0.609610+0.000478 
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
## 
## [11] train-logloss:0.234481+0.000441 test-logloss:0.241286+0.002251 
## [21] train-logloss:0.132791+0.001382 test-logloss:0.144123+0.003650 
## [31] train-logloss:0.096338+0.000781 test-logloss:0.111732+0.003524 
## [41] train-logloss:0.082204+0.000544 test-logloss:0.100913+0.003555 
## [51] train-logloss:0.075684+0.000768 test-logloss:0.096968+0.003582 
## [61] train-logloss:0.072295+0.000892 test-logloss:0.096070+0.003748 
## Stopping. Best iteration:
## [59] train-logloss:0.072854+0.000883 test-logloss:0.096028+0.003678
# Extract the best number of boosting rounds from the cross-validation results
best_nrounds <- cv_results$best_iteration
cat("Best number of rounds: ", best_nrounds, "\n")
## Best number of rounds:  59
# Use the best number of rounds to train the final model
final_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_nrounds
)

# Make predictions on the training data
preds <- predict(final_model, X)

# Convert probabilities to binary outcomes (0 or 1)
pred_labels <- ifelse(preds > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(pred_labels == y)
cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.9692672
# Calculate precision, recall, F1-score, confusion matrix, and ROC-AUC
conf_matrix <- confusionMatrix(as.factor(pred_labels), as.factor(y))
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 23881   588
##          1   190   656
##                                           
##                Accuracy : 0.9693          
##                  95% CI : (0.9671, 0.9714)
##     No Information Rate : 0.9509          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6123          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9921          
##             Specificity : 0.5273          
##          Pos Pred Value : 0.9760          
##          Neg Pred Value : 0.7754          
##              Prevalence : 0.9509          
##          Detection Rate : 0.9434          
##    Detection Prevalence : 0.9666          
##       Balanced Accuracy : 0.7597          
##                                           
##        'Positive' Class : 0               
## 
# Precision, Recall, F1-Score
precision <- conf_matrix$byClass["Pos Pred Value"]
recall <- conf_matrix$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Precision: ", precision, "\n")
## Precision:  0.9759696
cat("Recall: ", recall, "\n")
## Recall:  0.9921067
cat("F1-Score: ", f1_score, "\n")
## F1-Score:  0.983972
# ROC-AUC
roc_curve <- roc(y, preds)
auc_value <- auc(roc_curve)
cat("ROC-AUC: ", auc_value, "\n")
## ROC-AUC:  0.981444
# Load necessary libraries
library(e1071)
library(caret)
library(pROC)

# Define class weights to handle imbalance
class_weights <- c("no" = 1, "yes" = 19)

# Function to evaluate model
evaluate_model <- function(model, testData, model_name = "SVM") {
  preds <- predict(model, testData, probability = TRUE)
  probs <- attr(preds, "probabilities")[, "yes"]
  cm <- confusionMatrix(preds, testData$y)
  roc_obj <- roc(response = testData$y, predictor = probs, levels = c("no", "yes"))
  auc_val <- auc(roc_obj)
  
  f1 <- 2 * (cm$byClass["Pos Pred Value"] * cm$byClass["Sensitivity"]) /
    (cm$byClass["Pos Pred Value"] + cm$byClass["Sensitivity"])
  
  cat("\n---", model_name, "---\n")
  cat("Accuracy:", cm$overall["Accuracy"], "\n")
  cat("Precision:", cm$byClass["Pos Pred Value"], "\n")
  cat("Recall:", cm$byClass["Sensitivity"], "\n")
  cat("F1-Score:", f1, "\n")
  cat("ROC-AUC:", auc_val, "\n")
  plot(roc_obj, main = paste(model_name, "- ROC Curve"), col = "blue")
}

# Linear Kernel SVM
svm_linear <- svm(
  y ~ ., 
  data = trainData,
  type = 'C-classification',
  kernel = 'linear',
  class.weights = class_weights,
  probability = TRUE
)
evaluate_model(svm_linear, testData, "Linear SVM")
## 
## --- Linear SVM ---
## Accuracy: 0.9561525 
## Precision: 0.965154 
## Recall: 0.9896136 
## F1-Score: 0.9772308 
## ROC-AUC: 0.9639122

# RBF Kernel SVM
svm_rbf <- svm(
  y ~ ., 
  data = trainData,
  type = 'C-classification',
  kernel = 'radial',
  class.weights = class_weights,
  probability = TRUE
)
evaluate_model(svm_rbf, testData, "RBF SVM")
## 
## --- RBF SVM ---
## Accuracy: 0.9514122 
## Precision: 0.9621611 
## Recall: 0.9877441 
## F1-Score: 0.9747847 
## ROC-AUC: 0.9543942

# Polynomial Kernel SVM
svm_poly <- svm(
  y ~ ., 
  data = trainData,
  type = 'C-classification',
  kernel = 'polynomial',
  degree = 3,
  class.weights = class_weights,
  probability = TRUE
)
evaluate_model(svm_poly, testData, "Polynomial SVM")
## 
## --- Polynomial SVM ---
## Accuracy: 0.95635 
## Precision: 0.9655382 
## Recall: 0.9894059 
## F1-Score: 0.9773264 
## ROC-AUC: 0.9241607

# Radial kernel SVM with fixed hyperparameters
svm_rbf_model <- svm(
  y ~ ., 
  data = trainData,
  type = 'C-classification',
  kernel = 'radial',
  cost = 1,
  gamma = 0.1,
  class.weights = class_weights,
  probability = TRUE
)

evaluate_model(svm_rbf_model, testData, "Polynomial SVM")
## 
## --- Polynomial SVM ---
## Accuracy: 0.9484495 
## Precision: 0.9607367 
## Recall: 0.9860823 
## F1-Score: 0.9732445 
## ROC-AUC: 0.9465623

# Polynomial kernel SVM with extreme hyperparameters (opposite spectrum)
svm_poly_extreme_model <- svm(
  y ~ ., 
  data = trainData,
  type = 'C-classification',
  kernel = 'polynomial',
  degree = 5,            # Higher degree = more complex boundary
  cost = 1000,           # High cost = low tolerance for misclassification
  coef0 = 10,            # Affects influence of higher-order terms
  gamma = 10,            # High gamma = very sensitive to training points
  class.weights = NULL,  # No weighting
  probability = TRUE
)

evaluate_model(svm_poly_extreme_model, testData, "Extreme Polynomial SVM")
## 
## --- Extreme Polynomial SVM ---
## Accuracy: 0.9512147 
## Precision: 0.9511954 
## Recall: 1 
## F1-Score: 0.9749873 
## ROC-AUC: 0.7774179

# Polynomial kernel SVM with moderate hyperparameters (middle ground)
svm_poly_moderate_model <- svm(
  y ~ ., 
  data = trainData,
  type = 'C-classification',
  kernel = 'polynomial',
  degree = 3,            # Cubic decision boundary, but not overly complex
  cost = 10,             # Balanced penalty for misclassification
  coef0 = 1,             # Default/neutral influence of higher-order terms
  gamma = 0.5,           # Moderate sensitivity to data
  class.weights = class_weights,
  probability = TRUE
)

evaluate_model(svm_poly_moderate_model, testData, "Moderate Polynomial SVM")
## 
## --- Moderate Polynomial SVM ---
## Accuracy: 0.9490421 
## Precision: 0.9528827 
## Recall: 0.9956377 
## F1-Score: 0.9737911 
## ROC-AUC: 0.8136418