Predicting Term Deposit Subscriptions: A Data-Driven Analysis of Bank Marketing Strategies

Problem Statement

The core problem is to predict whether a bank client will subscribe to a term deposit, using demographics, banking history, and macroeconomic indicators. Framed as a binary classification task, an accurate model enables targeted marketing and reduces wasted outreach. The methodological challenge is building a robust classifier despite class imbalance and multicollinearity, both of which can bias traditional models and destabilize coefficient estimates.

# Libraries used throughout the analysis
library(ggplot2)
library(dplyr)
library(corrplot)
library(car)    # vif()
library(caret)
library(pROC)

# Task: Load the dataset
bank_data <- read.csv("bank-additional.csv", sep = ";", stringsAsFactors = TRUE)
# Drop 'duration': call duration is only known after a call ends, so keeping it
# would leak the outcome into a model meant to run before the client is contacted
bank_data <- subset(bank_data, select = -duration)

# Convert target variable to binary factor
bank_data$y <- as.factor(ifelse(bank_data$y == "yes", 1, 0))
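A quick class-balance check (a small sketch using the converted target) confirms the imbalance discussed in the problem statement: only about 11% of clients subscribed.

# Roughly 3668 "0" vs 451 "1" (~11% positives), so the classes are imbalanced
prop.table(table(bank_data$y))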

1. Descriptive Statistics

a. Summary Statistics

str(bank_data)
'data.frame':   4119 obs. of  20 variables:
 $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
 $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
 $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
 $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
 $ default       : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
 $ housing       : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
 $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
 $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
 $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
 $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
 $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
 $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
 $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
 $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
 $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
 $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
 $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
 $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
 $ nr.employed   : num  5099 5191 5228 5228 5196 ...
 $ y             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
summary(bank_data)
      age                 job           marital                   education   
 Min.   :18.00   admin.     :1012   divorced: 446   university.degree  :1264  
 1st Qu.:32.00   blue-collar: 884   married :2509   high.school        : 921  
 Median :38.00   technician : 691   single  :1153   basic.9y           : 574  
 Mean   :40.11   services   : 393   unknown :  11   professional.course: 535  
 3rd Qu.:47.00   management : 324                   basic.4y           : 429  
 Max.   :88.00   retired    : 166                   basic.6y           : 228  
                 (Other)    : 649                   (Other)            : 168  
    default        housing          loan           contact         month     
 no     :3315   no     :1839   no     :3349   cellular :2652   may    :1378  
 unknown: 803   unknown: 105   unknown: 105   telephone:1467   jul    : 711  
 yes    :   1   yes    :2175   yes    : 665                    aug    : 636  
                                                               jun    : 530  
                                                               nov    : 446  
                                                               apr    : 215  
                                                               (Other): 203  
 day_of_week    campaign          pdays          previous     
 fri:768     Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
 mon:855     1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
 thu:860     Median : 2.000   Median :999.0   Median :0.0000  
 tue:841     Mean   : 2.537   Mean   :960.4   Mean   :0.1903  
 wed:795     3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
             Max.   :35.000   Max.   :999.0   Max.   :6.0000  
                                                              
        poutcome     emp.var.rate      cons.price.idx  cons.conf.idx  
 failure    : 454   Min.   :-3.40000   Min.   :92.20   Min.   :-50.8  
 nonexistent:3523   1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7  
 success    : 142   Median : 1.10000   Median :93.75   Median :-41.8  
                    Mean   : 0.08497   Mean   :93.58   Mean   :-40.5  
                    3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4  
                    Max.   : 1.40000   Max.   :94.77   Max.   :-26.9  
                                                                      
   euribor3m      nr.employed   y       
 Min.   :0.635   Min.   :4964   0:3668  
 1st Qu.:1.334   1st Qu.:5099   1: 451  
 Median :4.857   Median :5191           
 Mean   :3.621   Mean   :5166           
 3rd Qu.:4.961   3rd Qu.:5228           
 Max.   :5.045   Max.   :5228           
                                        

b. Missing data analysis

# Check for explicit missing values (NA) in each column
missing_values <- colSums(is.na(bank_data))

# Print the per-column missing-value counts
missing_values
           age            job        marital      education        default 
             0              0              0              0              0 
       housing           loan        contact          month    day_of_week 
             0              0              0              0              0 
      campaign          pdays       previous       poutcome   emp.var.rate 
             0              0              0              0              0 
cons.price.idx  cons.conf.idx      euribor3m    nr.employed              y 
             0              0              0              0              0 
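No explicit NA values are present. Note, however, that this dataset encodes missingness implicitly as the factor level "unknown" (visible in default, housing, and loan in the summary above); a quick per-column count, sketched below, surfaces it:

# "unknown" acts as implicit missingness here; count its occurrences per factor column
sapply(bank_data[sapply(bank_data, is.factor)],
       function(x) sum(x == "unknown"))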
# Identify numeric and categorical variables
num_vars <- names(bank_data)[sapply(bank_data, is.numeric)]
# Select categorical variables excluding the target variable 'y'
cat_vars <- setdiff(names(bank_data)[sapply(bank_data, function(x) is.factor(x) || is.character(x))], "y")

2. Univariate Analysis

A. Target Variable Distribution

ggplot(bank_data, aes(x = y, fill = y)) + 
  geom_bar() + 
  scale_fill_brewer(palette = "Set1") +  # fill (not color) drives the bar shading
  labs(title = "Subscription Rate for Term Deposits", x = "Subscribed", y = "Count") +
  theme_minimal()

B. Histograms with Density Plots for Numeric Variables

for (var in num_vars) {
  # .data[[var]] and after_stat(density) are the current tidy-eval idioms,
  # replacing the deprecated aes_string() and ..density.. notation
  p <- ggplot(bank_data, aes(x = .data[[var]])) +
    geom_histogram(aes(y = after_stat(density)), fill = "skyblue", color = "black", bins = 30, alpha = 0.6) +
    geom_density(alpha = 0.3, fill = "red") +
    labs(title = paste("Histogram & Density of", var), x = var, y = "Density") +
    theme_minimal()
  print(p)
}

i. Binning

a. pdays
# Replace 999 with NA
bank_data$pdays[bank_data$pdays == 999] <- NA

bank_data$pdays_cat <- as.factor(ifelse(is.na(bank_data$pdays), "Not Contacted", "Contacted"))

ggplot(bank_data, aes(x = pdays_cat, fill = pdays_cat)) +
  geom_bar() +
  labs(title = "Pdays Distribution (Not Contacted vs. Contacted)", x = "Pdays", y = "Count") +
  theme_minimal()

bank_data <- subset(bank_data, select = -pdays)
b. previous
quantile(bank_data$previous, probs = c(0, 0.33, 0.67, 1))
  0%  33%  67% 100% 
   0    0    0    6 
table(bank_data$previous)

   0    1    2    3    4    5    6 
3523  475   78   25   14    2    2 
bank_data$previous_cat <- as.factor(ifelse(bank_data$previous == 0, "Never Contacted", 
                                  ifelse(bank_data$previous == 1, "Contacted Once", "More than Once")))

ggplot(bank_data, aes(x = previous_cat, fill = previous_cat)) +
  geom_bar() +
  labs(title = "Previous Contact Count Distribution", x = "Previous Contacts", y = "Count") +
  theme_minimal()

bank_data <- subset(bank_data, select = -previous)

C. Bar plots for categorical variables

cat_vars <- setdiff(names(bank_data)[sapply(bank_data, function(x) is.factor(x) || is.character(x))], "y")
for (var in cat_vars) {
  p <- ggplot(bank_data, aes(x = .data[[var]], fill = .data[[var]])) +
    geom_bar(alpha = 0.6, position = "dodge2") +  # Better spacing
    scale_fill_viridis_d() +  # Discrete viridis scale matching the fill aesthetic
    labs(title = paste("Bar Plot of", var), x = var, y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))  # Better readability
  
  print(p)
}

Rare categories observed above: in education, 'illiterate' and 'unknown'; in default, 'unknown' and 'yes'. These are merged into a single 'others' level below.

i. Handling rare categories

bank_data <- bank_data |> mutate( 
  # case_when needs consistent output types, so coerce the factors to character
  # before merging the rare levels, then convert back to factor
  education = as.factor(case_when(education %in% c("illiterate", "unknown") ~ "others",
                                  TRUE ~ as.character(education))),
  default   = as.factor(case_when(default %in% c("unknown", "yes") ~ "others",
                                  TRUE ~ as.character(default))))
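An equivalent and arguably tidier route (an alternative sketch, assuming the forcats package is available; it is not used elsewhere in this analysis) collapses the levels directly, avoiding the character round-trip. It is shown commented out since it would replace, not follow, the chunk above:

# forcats::fct_collapse merges factor levels in place (drop-in replacement)
# bank_data <- bank_data |> mutate(
#   education = forcats::fct_collapse(education, others = c("illiterate", "unknown")),
#   default   = forcats::fct_collapse(default, others = c("unknown", "yes")))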

3. Bivariate Analysis

A. Correlation Plot

num_vars <- names(bank_data)[sapply(bank_data, is.numeric)]
cor_matrix <- cor(bank_data[, num_vars])
diag(cor_matrix) <- 0
corrplot(cor_matrix, method = "color", type = "upper",order = "hclust", tl.col = "black", tl.srt = 45, addCoef.col = "white", number.cex = 0.7)

Inference: - emp.var.rate and euribor3m have a very strong positive correlation of 0.97, suggesting they are highly collinear. - euribor3m and nr.employed are also strongly correlated, at 0.94. - emp.var.rate and nr.employed are also strongly correlated, at 0.90. - Such high pairwise correlations among the macroeconomic indicators point to potential multicollinearity.

i. VIF

# VIF depends only on the predictors; recoding y as numeric just lets lm() run
vif(lm(as.numeric(bank_data$y) ~ age + campaign + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, data = bank_data)) 
           age       campaign   emp.var.rate cons.price.idx  cons.conf.idx 
      1.013208       1.041199      31.789746       6.306744       2.504303 
     euribor3m    nr.employed 
     62.827598      30.325722 

Inference: - emp.var.rate (31.79), euribor3m (62.83), and nr.employed (30.33) have extremely high VIF values, far above the usual cutoff of 10. - This confirms these variables are highly correlated with one another and capture very similar patterns. - Their relationship to the target is analyzed further below before deciding how to handle them.
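For intuition, VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing predictor j on the remaining predictors. Inverting the reported values (a small illustrative calculation, not part of the original output) shows just how redundant these predictors are:

# Implied R^2 of each high-VIF predictor when regressed on the others
vifs <- c(emp.var.rate = 31.79, euribor3m = 62.83, nr.employed = 30.33)
1 - 1 / vifs  # ~0.969, ~0.984, ~0.967: each is almost fully explained by the rest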

B. Boxplots comparing numeric variables with respect to the Subscription Status (target variable 'y')

# Boxplots comparing numeric variables with respect to the target variable 'y'
for (num_var in num_vars) {
  # theme_minimal() must come before theme(), or it resets the axis-text tweak
  p <- ggplot(bank_data, aes(x = y, y = .data[[num_var]], fill = y)) +
    geom_boxplot(outlier.colour = "maroon", outlier.shape = 8, outlier.size = 2) +  # Customizing outliers
    labs(title = paste("Boxplot of", num_var, "by Subscription Status"), x = "Subscription Status", y = num_var) +
    scale_fill_viridis_d(alpha = 0.6) +  # Viridis color palette for target classes
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}

Inference :

  • The three collinear variables (emp.var.rate, euribor3m, and nr.employed), along with cons.conf.idx and cons.price.idx, each show clear distributional differences with respect to the target (subscription status). This tells us two things:

  • Predictive value: each of these variables is individually informative for predicting the target.

  • Multicollinearity concern: despite their individual predictive power, their high collinearity (as evidenced by the high VIFs) can cause issues in regression-based models.

C. Categorical vs. Target: Bar plots for each categorical variable

# Proportion-stacked bar plots for categorical variables vs target 'y',
# each preceded by a chi-square test of association
for (cat_var in cat_vars) {
  cat("Chi-Square test for", cat_var, "\n")
  table_data <- table(bank_data[[cat_var]], bank_data$y)
  print(chisq.test(table_data))
  cat("\n")
  p <- ggplot(bank_data, aes(x = .data[[cat_var]], fill = y)) +
    geom_bar(position = "fill") +  # "fill" stacks to proportions, not raw counts
    labs(title = paste("Proportion of", cat_var, "by Subscription Status"), 
         x = cat_var, y = "Proportion") +
    scale_fill_viridis_d(alpha = 0.6) +  # Color palette for better visualization
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) 
  
  print(p)
}
Chi-Square test for job 
Warning in chisq.test(table_data): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test

data:  table_data
X-squared = 69.979, df = 11, p-value = 1.233e-10

Chi-Square test for marital 
Warning in chisq.test(table_data): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test

data:  table_data
X-squared = 10.286, df = 3, p-value = 0.01629

Chi-Square test for education 

    Pearson's Chi-squared test

data:  table_data
X-squared = 22.045, df = 6, p-value = 0.001188

Chi-Square test for default 

    Pearson's Chi-squared test with Yates' continuity correction

data:  table_data
X-squared = 23.533, df = 1, p-value = 1.228e-06

Chi-Square test for housing 

    Pearson's Chi-squared test

data:  table_data
X-squared = 0.62738, df = 2, p-value = 0.7307

Chi-Square test for loan 

    Pearson's Chi-squared test

data:  table_data
X-squared = 1.1297, df = 2, p-value = 0.5684

Chi-Square test for contact 

    Pearson's Chi-squared test with Yates' continuity correction

data:  table_data
X-squared = 76.846, df = 1, p-value < 2.2e-16

Chi-Square test for month 
Warning in chisq.test(table_data): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test

data:  table_data
X-squared = 299.79, df = 9, p-value < 2.2e-16

Chi-Square test for day_of_week 

    Pearson's Chi-squared test

data:  table_data
X-squared = 0.51218, df = 4, p-value = 0.9723

Chi-Square test for poutcome 

    Pearson's Chi-squared test

data:  table_data
X-squared = 454.49, df = 2, p-value < 2.2e-16

Chi-Square test for pdays_cat 

    Pearson's Chi-squared test with Yates' continuity correction

data:  table_data
X-squared = 448.22, df = 1, p-value < 2.2e-16

Chi-Square test for previous_cat 

    Pearson's Chi-squared test

data:  table_data
X-squared = 258.52, df = 2, p-value < 2.2e-16

Interpretation:

The following variables have p-values > 0.05, meaning they may not be strong predictors: housing (p = 0.7307), loan (p = 0.5684), and day_of_week (p = 0.9723). Based on the chi-square tests, these variables show no significant association with y, and the same pattern is visible in the proportion-based stacked bar plots. We therefore consider removing them from the model to reduce noise.
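As a complement, p-values say nothing about the strength of an association. A small sketch of Cramér's V (an effect-size measure not computed in the original analysis) can rank the significant predictors:

# Cramér's V = sqrt(chi-square / (n * (min(dim) - 1))), ranging from 0 to 1
cramers_v <- function(x, y) {
  tab <- table(x, y)
  chi2 <- suppressWarnings(chisq.test(tab)$statistic)
  sqrt(as.numeric(chi2) / (sum(tab) * (min(dim(tab)) - 1)))
}
sort(sapply(bank_data[cat_vars], cramers_v, y = bank_data$y), decreasing = TRUE)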

i. Removing less significant features

bank_data <- subset(bank_data, select = -c(housing,loan,day_of_week))

D. Plots to analyze highly collinear variables

i. Plot for Relationship between emp.var.rate, nr.employed, and euribor3m

ggplot(bank_data, aes(x = emp.var.rate, y = nr.employed, color = euribor3m)) +
  geom_point(alpha = 0.7, size = 3) +
  scale_color_gradient(low = "blue", high = "red") +
  labs(title = "Relationship between emp.var.rate, nr.employed, and euribor3m",
       x = "emp.var.rate", y = "nr.employed", color = "euribor3m") +
  theme_minimal()

Inference: the plot confirms visually that all three variables move together; the euribor3m color gradient rises and falls in step with the joint movement of emp.var.rate and nr.employed.

ii. Combination variable creation for the 3 collinear variables

# Equal-weight combination of the three scaled variables
# (as.numeric drops the matrix attributes that scale() returns)
bank_data$comb_var <- as.numeric(scale(bank_data$emp.var.rate) + scale(bank_data$euribor3m) + scale(bank_data$nr.employed))
# Remove the original variables after creating the linear combination
bank_data_filtered <- subset(bank_data, select = -c(emp.var.rate, euribor3m, nr.employed))
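An alternative worth noting (a sketch only; the analysis proceeds with the equal-weight sum): the first principal component of the three series would weight them by their shared variance instead of equally.

# PCA on the three collinear indicators; PC1 should capture most shared variance
pc <- prcomp(bank_data[, c("emp.var.rate", "euribor3m", "nr.employed")],
             center = TRUE, scale. = TRUE)
summary(pc)
# bank_data$comb_var <- pc$x[, 1]  # would be the drop-in replacement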

4. Data Modeling

i. Train Test Split

bank_data_filtered$y <- as.factor(bank_data_filtered$y)
# Split the dataset into training and testing sets
set.seed(101)
trainIndex <- createDataPartition(bank_data_filtered$y, p = 0.8, list = FALSE, times = 1)
train_data <- bank_data_filtered[trainIndex, ]
test_data <- bank_data_filtered[-trainIndex, ]
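createDataPartition stratifies on y, so both splits should preserve the ~11% positive rate; a quick illustrative check:

# Verify the class ratio survives the split (stratified by y)
prop.table(table(train_data$y))
prop.table(table(test_data$y))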

ii. Outlier Treatment using IQR method

numeric_vars <- names(bank_data_filtered)[sapply(bank_data_filtered, is.numeric)]
for (var in numeric_vars) {
  # Bounds are estimated on the training data only and then applied to both
  # splits, so no information leaks from the test set
  Q1 <- quantile(train_data[[var]], 0.25, na.rm = TRUE)
  Q3 <- quantile(train_data[[var]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  # Winsorize: clamp values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] to the bounds
  train_data[[var]] <- pmin(pmax(train_data[[var]], lower_bound), upper_bound)
  test_data[[var]] <- pmin(pmax(test_data[[var]], lower_bound), upper_bound)
}

iii. Feature scaling

# Fit the pre-processing (scaling) on the train data 
preprocess_params <- preProcess(train_data[, numeric_vars], method = c("center", "scale"))
train_data[, numeric_vars] <- predict(preprocess_params, newdata = train_data[, numeric_vars])
test_data[, numeric_vars] <- predict(preprocess_params, newdata = test_data[, numeric_vars])

train_data$y <- factor(train_data$y, levels = c(0, 1), labels = c("No", "Yes"))
test_data$y <- factor(test_data$y, levels = c(0, 1), labels = c("No", "Yes"))

iv. Baseline Models

# Train Logistic Regression (baseline model)
logistic_model <- train(y ~ ., data = train_data, method = "glm", family = "binomial", trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE))
# Train LDA (baseline model)
lda_model <- train(y ~ ., data = train_data, method = "lda", trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE))
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
# Train Random Forest (baseline model)
rf_model <- train(y ~ ., data = train_data, method = "rf", trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE))
# Predictions for test_data
pred_logit <- predict(logistic_model, newdata = test_data)
pred_lda <- predict(lda_model, newdata = test_data)
pred_rf <- predict(rf_model, newdata = test_data)

# Confusion matrices
cm_logit <- confusionMatrix(pred_logit, test_data$y)
cm_lda <- confusionMatrix(pred_lda, test_data$y)
cm_rf <- confusionMatrix(pred_rf, test_data$y)

# Print classification metrics (Precision, Recall, F1)
cat("Logistic Regression Metrics:\n")
Logistic Regression Metrics:
print(cm_logit$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.9809004            0.2666667            0.9159236 
      Neg Pred Value            Precision               Recall 
           0.6315789            0.9159236            0.9809004 
                  F1           Prevalence       Detection Rate 
           0.9472991            0.8906440            0.8736330 
Detection Prevalence    Balanced Accuracy 
           0.9538275            0.6237835 
cat("\nLDA Metrics:\n")

LDA Metrics:
print(cm_lda$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.9631651            0.3888889            0.9277267 
      Neg Pred Value            Precision               Recall 
           0.5645161            0.9277267            0.9631651 
                  F1           Prevalence       Detection Rate 
           0.9451138            0.8906440            0.8578372 
Detection Prevalence    Balanced Accuracy 
           0.9246659            0.6760270 
cat("\nRandom Forest Metrics:\n")

Random Forest Metrics:
print(cm_rf$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.9918145            0.2000000            0.9098874 
      Neg Pred Value            Precision               Recall 
           0.7500000            0.9098874            0.9918145 
                  F1           Prevalence       Detection Rate 
           0.9490862            0.8906440            0.8833536 
Detection Prevalence    Balanced Accuracy 
           0.9708384            0.5959072 
# AUC-ROC calculation for baseline models
logit_probs <- predict(logistic_model, newdata = test_data, type = "prob")
lda_probs <- predict(lda_model, newdata = test_data, type = "prob")
rf_probs <- predict(rf_model, newdata = test_data, type = "prob")

# Calculate AUC for baseline models
auc_logit <- auc(roc(test_data$y, logit_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_lda <- auc(roc(test_data$y, lda_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_rf <- auc(roc(test_data$y, rf_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Print AUC for baseline models
cat("AUC for Logistic Regression (baseline):", auc_logit, "\n")
AUC for Logistic Regression (baseline): 0.7857663 
cat("AUC for LDA (baseline):", auc_lda, "\n")
AUC for LDA (baseline): 0.788328 
cat("AUC for Random Forest (baseline):", auc_rf, "\n")
AUC for Random Forest (baseline): 0.7568971 

v. Custom Class Weights

train_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary, verboseIter = FALSE)

# Compute class distribution
class_counts <- table(train_data$y)
total_samples <- sum(class_counts)

# Compute class weights (inverse of class frequencies)
class_weights <- total_samples / (length(class_counts) * class_counts)
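With roughly 2935 "No" and 361 "Yes" cases in the 80% training split (approximate figures implied by the stratified split above), the inverse-frequency formula gives the minority class about an 8x larger weight; printing the vector makes this concrete:

# Inverse-frequency weights: approximately 0.56 for "No" and 4.6 for "Yes"
class_weights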

# Train Logistic Regression with weights
logistic_weighted <- train(y ~ ., data = train_data, method = "glm", family = "binomial", 
                           trControl = train_control, weights = class_weights[train_data$y])
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# Train LDA with weights
lda_weighted <- train(y ~ ., data = train_data, method = "lda", 
                      trControl = train_control, weights = class_weights[train_data$y])
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
# Train Random Forest with weights
rf_weighted <- train(y ~ ., data = train_data, method = "rf", 
                     trControl = train_control, weights = class_weights[train_data$y])
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
# Predictions for weighted models
pred_logit_weighted <- predict(logistic_weighted, newdata = test_data)
pred_lda_weighted <- predict(lda_weighted, newdata = test_data)
pred_rf_weighted <- predict(rf_weighted, newdata = test_data)

# Get confusion matrices
cm_logit_weighted <- confusionMatrix(pred_logit_weighted, test_data$y)
cm_lda_weighted <- confusionMatrix(pred_lda_weighted, test_data$y)
cm_rf_weighted <- confusionMatrix(pred_rf_weighted, test_data$y)

# Print Classification Report (Precision, Recall, F1)
print(cm_logit_weighted$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.8376535            0.6222222            0.9475309 
      Neg Pred Value            Precision               Recall 
           0.3200000            0.9475309            0.8376535 
                  F1           Prevalence       Detection Rate 
           0.8892107            0.8906440            0.7460510 
Detection Prevalence    Balanced Accuracy 
           0.7873633            0.7299379 
print(cm_lda_weighted$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.9631651            0.3888889            0.9277267 
      Neg Pred Value            Precision               Recall 
           0.5645161            0.9277267            0.9631651 
                  F1           Prevalence       Detection Rate 
           0.9451138            0.8906440            0.8578372 
Detection Prevalence    Balanced Accuracy 
           0.9246659            0.6760270 
print(cm_rf_weighted$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.9904502            0.2000000            0.9097744 
      Neg Pred Value            Precision               Recall 
           0.7200000            0.9097744            0.9904502 
                  F1           Prevalence       Detection Rate 
           0.9483997            0.8906440            0.8821385 
Detection Prevalence    Balanced Accuracy 
           0.9696233            0.5952251 
# AUC-ROC Calculation for weighted models
logit_weighted_probs <- predict(logistic_weighted, newdata = test_data, type = "prob")
lda_weighted_probs <- predict(lda_weighted, newdata = test_data, type = "prob")
rf_weighted_probs <- predict(rf_weighted, newdata = test_data, type = "prob")

# Calculate AUC for weighted models
auc_logit_weighted <- auc(roc(test_data$y, logit_weighted_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_lda_weighted <- auc(roc(test_data$y, lda_weighted_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_rf_weighted <- auc(roc(test_data$y, rf_weighted_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Print AUC for weighted models
cat("AUC for Logistic Regression Weighted:", auc_logit_weighted, "\n")
AUC for Logistic Regression Weighted: 0.7800364 
cat("AUC for LDA Weighted:", auc_lda_weighted, "\n")
AUC for LDA Weighted: 0.788328 
cat("AUC for Random Forest Weighted:", auc_rf_weighted, "\n")
AUC for Random Forest Weighted: 0.7659997 

vi. Upsampling

cv_control_upsampled <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary, sampling = "up", verboseIter = FALSE, savePredictions = "final")
model_logit_upsampled <- train(y ~ ., data = train_data, method = "glm", family = binomial, trControl = cv_control_upsampled, metric = "ROC")  
model_lda_upsampled <- train(y ~ ., data = train_data, method = "lda", trControl = cv_control_upsampled, metric = "ROC")  # lda accepts no 'family' argument
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
model_rf_upsampled <- train(y ~ ., data = train_data, method = "rf", trControl = cv_control_upsampled, metric = "ROC")  # rf accepts no 'family' argument
# Predictions on test sample
pred_logit_upsampled <- predict(model_logit_upsampled, newdata = test_data)
pred_lda_upsampled <- predict(model_lda_upsampled, newdata = test_data)
pred_rf_upsampled <- predict(model_rf_upsampled, newdata = test_data)

# Get confusion matrices
cm_logit_upsampled <- confusionMatrix(pred_logit_upsampled, test_data$y)
cm_lda_upsampled <- confusionMatrix(pred_lda_upsampled, test_data$y)
cm_rf_upsampled <- confusionMatrix(pred_rf_upsampled, test_data$y)

# Print Classification Report (Precision, Recall, F1)
print(cm_logit_upsampled$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.8485675            0.6222222            0.9481707 
      Neg Pred Value            Precision               Recall 
           0.3353293            0.9481707            0.8485675 
                  F1           Prevalence       Detection Rate 
           0.8956084            0.8906440            0.7557716 
Detection Prevalence    Balanced Accuracy 
           0.7970838            0.7353949 
print(cm_lda_upsampled$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.8321965            0.6333333            0.9486781 
      Neg Pred Value            Precision               Recall 
           0.3166667            0.9486781            0.8321965 
                  F1           Prevalence       Detection Rate 
           0.8866279            0.8906440            0.7411908 
Detection Prevalence    Balanced Accuracy 
           0.7812880            0.7327649 
print(cm_rf_upsampled$byClass)
         Sensitivity          Specificity       Pos Pred Value 
           0.9017735            0.5555556            0.9429387 
      Neg Pred Value            Precision               Recall 
           0.4098361            0.9429387            0.9017735 
                  F1           Prevalence       Detection Rate 
           0.9218968            0.8906440            0.8031592 
Detection Prevalence    Balanced Accuracy 
           0.8517618            0.7286645 
# AUC-ROC Calculation
logit_probs_upsampled <- predict(model_logit_upsampled, newdata = test_data, type = "prob")
lda_probs_upsampled <- predict(model_lda_upsampled, newdata = test_data, type = "prob")
rf_probs_upsampled <- predict(model_rf_upsampled, newdata = test_data, type = "prob")

# Calculate AUC
auc_logit_upsampled <- auc(roc(test_data$y, logit_probs_upsampled$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_lda_upsampled <- auc(roc(test_data$y, lda_probs_upsampled$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_rf_upsampled <- auc(roc(test_data$y, rf_probs_upsampled$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Print AUC for each model
cat("AUC for Logistic Regression:", auc_logit_upsampled, "\n")
AUC for Logistic Regression: 0.7760952 
cat("AUC for LDA:", auc_lda_upsampled, "\n")
AUC for LDA: 0.7829923 
cat("AUC for Random Forest:", auc_rf_upsampled, "\n")
AUC for Random Forest: 0.7801804 

vii. Decision Thresholding

library(pROC)
library(caret)
library(ggplot2)

# Helper to pull the F1-score from a confusion matrix (kept for reference;
# the analysis below reports the full byClass metrics instead)
get_f1_score <- function(preds, actuals) {
  cm <- confusionMatrix(as.factor(preds), as.factor(actuals))
  return(cm$byClass["F1"])
}

# Calculate ROC and AUC for each model
roc_logit <- roc(test_data$y, logit_probs_upsampled$Yes)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
roc_lda <- roc(test_data$y, lda_probs_upsampled$Yes)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
roc_rf <- roc(test_data$y, rf_probs_upsampled$Yes)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Sensitivity and Specificity for each model at different thresholds
sens_spec_logit <- data.frame(
  threshold = roc_logit$thresholds,
  sensitivity = roc_logit$sensitivities,
  specificity = roc_logit$specificities
)

sens_spec_lda <- data.frame(
  threshold = roc_lda$thresholds,
  sensitivity = roc_lda$sensitivities,
  specificity = roc_lda$specificities
)

sens_spec_rf <- data.frame(
  threshold = roc_rf$thresholds,
  sensitivity = roc_rf$sensitivities,
  specificity = roc_rf$specificities
)

# Calculate the difference between sensitivity and specificity at each threshold
diff_logit <- abs(sens_spec_logit$sensitivity - sens_spec_logit$specificity)
diff_lda <- abs(sens_spec_lda$sensitivity - sens_spec_lda$specificity)
diff_rf <- abs(sens_spec_rf$sensitivity - sens_spec_rf$specificity)

# Find the threshold where this difference is minimized (best balance)
optimal_logit_threshold <- sens_spec_logit$threshold[which.min(diff_logit)]
optimal_lda_threshold <- sens_spec_lda$threshold[which.min(diff_lda)]
optimal_rf_threshold <- sens_spec_rf$threshold[which.min(diff_rf)]

cat("Optimal Threshold for Logistic Regression (Balance Sensitivity-Specificity):", optimal_logit_threshold, "\n")
Optimal Threshold for Logistic Regression (Balance Sensitivity-Specificity): 0.3840407 
cat("Optimal Threshold for LDA (Balance Sensitivity-Specificity):", optimal_lda_threshold, "\n")
Optimal Threshold for LDA (Balance Sensitivity-Specificity): 0.3662219 
cat("Optimal Threshold for Random Forest (Balance Sensitivity-Specificity):", optimal_rf_threshold, "\n")
Optimal Threshold for Random Forest (Balance Sensitivity-Specificity): 0.222 
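For reference, pROC can return a "best" threshold directly via coords(); Youden's J (maximizing sensitivity + specificity - 1) is closely related to the balance criterion used above. A brief sketch:

# pROC's built-in threshold search; best.method = "youden" maximizes sens + spec - 1
coords(roc_logit, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))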
# Plot ROC curves with each model's balanced-threshold operating point.
# Combining the curves into one data frame lets ggplot build a proper legend;
# the thresholds are probabilities, so they are marked as points on the curves
# rather than as vertical lines on the FPR axis.
roc_df <- rbind(
  data.frame(model = "Logistic Regression", fpr = 1 - roc_logit$specificities, tpr = roc_logit$sensitivities),
  data.frame(model = "LDA", fpr = 1 - roc_lda$specificities, tpr = roc_lda$sensitivities),
  data.frame(model = "Random Forest", fpr = 1 - roc_rf$specificities, tpr = roc_rf$sensitivities)
)

# Operating point of each model at its balanced sensitivity-specificity threshold
opt_points <- rbind(
  data.frame(model = "Logistic Regression",
             fpr = 1 - sens_spec_logit$specificity[which.min(diff_logit)],
             tpr = sens_spec_logit$sensitivity[which.min(diff_logit)]),
  data.frame(model = "LDA",
             fpr = 1 - sens_spec_lda$specificity[which.min(diff_lda)],
             tpr = sens_spec_lda$sensitivity[which.min(diff_lda)]),
  data.frame(model = "Random Forest",
             fpr = 1 - sens_spec_rf$specificity[which.min(diff_rf)],
             tpr = sens_spec_rf$sensitivity[which.min(diff_rf)])
)

ggplot(roc_df, aes(x = fpr, y = tpr, color = model)) +
  geom_line(linewidth = 1.2) +
  # Chance line for reference
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey60") +
  # Mark each model's balanced-threshold operating point on its curve
  geom_point(data = opt_points, size = 3, shape = 18) +
  labs(title = "ROC Curves with Balanced Sensitivity-Specificity Operating Points",
       x = "False Positive Rate (1 - Specificity)",
       y = "True Positive Rate (Sensitivity)",
       color = "Model") +
  scale_color_manual(values = c("Logistic Regression" = "blue", "LDA" = "red", "Random Forest" = "green")) +
  xlim(0, 1) + ylim(0, 1) +
  theme_minimal()

# Function to calculate performance metrics at a specific threshold
calculate_metrics_at_threshold <- function(pred_probs, actuals, threshold) {
  # Convert probabilities to class predictions based on the threshold,
  # keeping the factor levels aligned with the actuals
  preds <- factor(ifelse(pred_probs > threshold, "Yes", "No"), levels = levels(actuals))
  
  # Create confusion matrix
  cm <- confusionMatrix(preds, actuals)
  
  # Extract the per-class metrics and append the threshold used
  metrics <- cm$byClass
  metrics["Threshold"] <- threshold
  return(metrics)
}

# Calculate metrics for Logistic Regression at optimal threshold
logit_metrics <- calculate_metrics_at_threshold(logit_probs_upsampled$Yes, test_data$y, optimal_logit_threshold)

# Calculate metrics for LDA at optimal threshold
lda_metrics <- calculate_metrics_at_threshold(lda_probs_upsampled$Yes, test_data$y, optimal_lda_threshold)

# Calculate metrics for Random Forest at optimal threshold
rf_metrics <- calculate_metrics_at_threshold(rf_probs_upsampled$Yes, test_data$y, optimal_rf_threshold)

# Combine all metrics into a data frame
all_metrics <- rbind(logit_metrics, lda_metrics, rf_metrics)
rownames(all_metrics) <- c("Logistic Regression", "LDA", "Random Forest")

# Print the metrics
print(all_metrics)
                    Sensitivity Specificity Pos Pred Value Neg Pred Value
Logistic Regression   0.6780355   0.6777778      0.9448669      0.2053872
LDA                   0.6998636   0.7000000      0.9500000      0.2226148
Random Forest         0.7107776   0.7111111      0.9524680      0.2318841
                    Precision    Recall        F1 Prevalence Detection Rate
Logistic Regression 0.9448669 0.6780355 0.7895155   0.890644      0.6038882
LDA                 0.9500000 0.6998636 0.8059701   0.890644      0.6233293
Random Forest       0.9524680 0.7107776 0.8140625   0.890644      0.6330498
                    Detection Prevalence Balanced Accuracy Threshold
Logistic Regression            0.6391252         0.6779066 0.3840407
LDA                            0.6561361         0.6999318 0.3662219
Random Forest                  0.6646416         0.7109444 0.2220000

Random Forest delivers the best overall performance at its optimal threshold, with the highest sensitivity, specificity, precision, F1 score, and balanced accuracy of the three models.
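For a consolidated view, the test-set AUC values computed above can be gathered into one table (a small sketch using the objects already in the session):

# Side-by-side AUC for each model and imbalance-handling strategy
auc_summary <- data.frame(
  Model = c("Logistic Regression", "LDA", "Random Forest"),
  Baseline = as.numeric(c(auc_logit, auc_lda, auc_rf)),
  Weighted = as.numeric(c(auc_logit_weighted, auc_lda_weighted, auc_rf_weighted)),
  Upsampled = as.numeric(c(auc_logit_upsampled, auc_lda_upsampled, auc_rf_upsampled))
)
print(auc_summary)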