# Task: Load the dataset
bank_data <- read.csv("bank-additional.csv", sep = ";", stringsAsFactors = TRUE)
Predicting Term Deposit Subscriptions: A Data-Driven Analysis of Bank Marketing Strategies
Problem Statement
The core problem is to accurately predict whether a bank client will subscribe to a term deposit, using demographics, banking history, and macroeconomic indicators. This is a binary classification problem whose solution enables targeted marketing and reduces wasted outreach. The methodological challenge is to build a robust classifier despite class imbalance and multicollinearity, under which traditional models can be biased toward the majority class and coefficient estimates can become unstable.
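The degree of imbalance can be quantified directly before any modeling; a minimal sketch (assuming y still carries its raw yes/no coding at this point in the script):
# Class balance check: roughly 89% "no" vs. 11% "yes" in this dataset
table(bank_data$y)
prop.table(table(bank_data$y))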
# Drop 'duration': it is only known after a call ends, so keeping it would leak the outcome
bank_data <- subset(bank_data, select = -duration)
# Convert target variable to binary factor
bank_data$y <- as.factor(ifelse(bank_data$y == "yes", 1, 0))
1. Descriptive Statistics
a. Summary Statistics
str(bank_data)
'data.frame': 4119 obs. of 20 variables:
$ age : int 30 39 25 38 47 32 32 41 31 35 ...
$ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
$ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
$ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
$ default : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
$ housing : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
$ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
$ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
$ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
$ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
$ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
$ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
$ previous : int 0 0 0 0 0 2 0 0 1 0 ...
$ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
$ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
$ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
$ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
$ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
$ nr.employed : num 5099 5191 5228 5228 5196 ...
$ y : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
summary(bank_data)
 age             job              marital          education
Min. :18.00 admin. :1012 divorced: 446 university.degree :1264
1st Qu.:32.00 blue-collar: 884 married :2509 high.school : 921
Median :38.00 technician : 691 single :1153 basic.9y : 574
Mean :40.11 services : 393 unknown : 11 professional.course: 535
3rd Qu.:47.00 management : 324 basic.4y : 429
Max. :88.00 retired : 166 basic.6y : 228
(Other) : 649 (Other) : 168
default housing loan contact month
no :3315 no :1839 no :3349 cellular :2652 may :1378
unknown: 803 unknown: 105 unknown: 105 telephone:1467 jul : 711
yes : 1 yes :2175 yes : 665 aug : 636
jun : 530
nov : 446
apr : 215
(Other): 203
day_of_week campaign pdays previous
fri:768 Min. : 1.000 Min. : 0.0 Min. :0.0000
mon:855 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
thu:860 Median : 2.000 Median :999.0 Median :0.0000
tue:841 Mean : 2.537 Mean :960.4 Mean :0.1903
wed:795 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
Max. :35.000 Max. :999.0 Max. :6.0000
poutcome emp.var.rate cons.price.idx cons.conf.idx
failure : 454 Min. :-3.40000 Min. :92.20 Min. :-50.8
nonexistent:3523 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
success : 142 Median : 1.10000 Median :93.75 Median :-41.8
Mean : 0.08497 Mean :93.58 Mean :-40.5
3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
Max. : 1.40000 Max. :94.77 Max. :-26.9
euribor3m nr.employed y
Min. :0.635 Min. :4964 0:3668
1st Qu.:1.334 1st Qu.:5099 1: 451
Median :4.857 Median :5191
Mean :3.621 Mean :5166
3rd Qu.:4.961 3rd Qu.:5228
Max. :5.045 Max. :5228
b. Missing data analysis
# Check for missing values (NA) and NaN
missing_values <- colSums(is.na(bank_data))
# Print the missing-value counts per column
missing_values
           age            job        marital      education        default
0 0 0 0 0
housing loan contact month day_of_week
0 0 0 0 0
campaign pdays previous poutcome emp.var.rate
0 0 0 0 0
cons.price.idx cons.conf.idx euribor3m nr.employed y
0 0 0 0 0
# Identify numeric and categorical variables
num_vars <- names(bank_data)[sapply(bank_data, is.numeric)]
# Select categorical variables excluding the target variable 'y'
cat_vars <- setdiff(names(bank_data)[sapply(bank_data, function(x) is.factor(x) || is.character(x))], "y")
2. Univariate Analysis
A. Target Variable Distribution
ggplot(bank_data, aes(x = y, fill = y)) +
  geom_bar() +
  scale_fill_brewer(palette = 1) + # fill (not color) scale, since the bars are filled
  labs(title = "Subscription Rate for Term Deposits", x = "Subscribed", y = "Count") +
  theme_minimal()
B. Histograms with Density Plots for Numeric Variables
for (var in num_vars) {
  p <- ggplot(bank_data, aes(x = .data[[var]])) + # tidy-eval replacement for the deprecated aes_string()
    geom_histogram(aes(y = after_stat(density)), fill = "skyblue", color = "black", bins = 30, alpha = 0.6) +
    geom_density(alpha = 0.3, fill = "red") +
    labs(title = paste("Histogram & Density of", var), x = var, y = "Density") +
    theme_minimal()
  print(p)
}
i. Binning
a. pdays
# Replace 999 with NA
bank_data$pdays[bank_data$pdays == 999] <- NA
bank_data$pdays_cat <- as.factor(ifelse(is.na(bank_data$pdays), "Not Contacted", "Contacted"))
ggplot(bank_data, aes(x = pdays_cat, fill = pdays_cat)) +
geom_bar() +
labs(title = "Pdays Distribution (Not Contacted vs. Contacted)", x = "Pdays", y = "Count") +
  theme_minimal()
bank_data <- subset(bank_data, select = -pdays)
b. previous
quantile(bank_data$previous, probs = c(0, 0.33, 0.67, 1))
 0% 33% 67% 100%
0 0 0 6
table(bank_data$previous)
0 1 2 3 4 5 6
3523 475 78 25 14 2 2
bank_data$previous_cat <- as.factor(ifelse(bank_data$previous == 0, "Never Contacted",
ifelse(bank_data$previous == 1, "Contacted Once", "More than Once")))
ggplot(bank_data, aes(x = previous_cat, fill = previous_cat)) +
geom_bar() +
labs(title = "Previous Contact Count Distribution", x = "Previous Contacts", y = "Count") +
  theme_minimal()
bank_data <- subset(bank_data, select = -previous)
C. Bar plots for categorical variables
cat_vars <- setdiff(names(bank_data)[sapply(bank_data, function(x) is.factor(x) || is.character(x))], "y")
for (var in cat_vars) {
  p <- ggplot(bank_data, aes(x = .data[[var]], fill = .data[[var]])) + # tidy eval instead of aes_string()
    geom_bar(alpha = 0.6, position = "dodge2") +
    scale_fill_viridis_d() + # discrete fill scale; a binned color scale has no effect on filled bars
    labs(title = paste("Bar Plot of", var), x = var, y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) # rotate labels for readability
  print(p)
}
From the bar plots, rare levels stand out: education has very few 'illiterate' and 'unknown' cases, and default has very few 'unknown' and 'yes' cases. These are collapsed into 'others' below.
i. Handling rare categories
bank_data <- bank_data |> mutate(
  education = as.factor(case_when(education %in% c("illiterate", "unknown") ~ "others",
                                  TRUE ~ as.character(education))),
  default   = as.factor(case_when(default %in% c("unknown", "yes") ~ "others",
                                  TRUE ~ as.character(default))))
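An equivalent, arguably more idiomatic route is forcats, which lumps infrequent factor levels without manual case_when logic. A hedged sketch; the min = 50 threshold is an illustrative choice, whereas the case_when above targets specific levels:
library(forcats)
# Collapse any level with fewer than 50 observations into "others" (illustrative cutoff)
bank_data$education <- fct_lump_min(bank_data$education, min = 50, other_level = "others")
bank_data$default   <- fct_lump_min(bank_data$default,   min = 50, other_level = "others")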
3. Bivariate Analysis
A. Correlation Plot
num_vars <- names(bank_data)[sapply(bank_data, is.numeric)]
cor_matrix <- cor(bank_data[, num_vars])
diag(cor_matrix) <- 0
corrplot(cor_matrix, method = "color", type = "upper", order = "hclust", tl.col = "black", tl.srt = 45, addCoef.col = "white", number.cex = 0.7)
Inference: emp.var.rate and euribor3m have a very strong positive correlation of 0.97, suggesting they are highly collinear. euribor3m and nr.employed are also strongly correlated, at 0.94, as are emp.var.rate and nr.employed, at 0.90. Such high correlations point to potential multicollinearity.
i. VIF
vif(lm(as.numeric(as.factor(bank_data$y)) ~ age + campaign + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + nr.employed, data = bank_data))
           age       campaign   emp.var.rate cons.price.idx  cons.conf.idx
      1.013208       1.041199      31.789746       6.306744       2.504303
     euribor3m    nr.employed
     62.827598      30.325722
Inference: emp.var.rate (31.79), euribor3m (62.83), and nr.employed (30.33) have extremely high VIF values, far above the common rule-of-thumb cutoff of 10. This confirms that these variables are highly correlated with one another and are likely capturing very similar patterns; their relationship with the target is analyzed further below.
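One common follow-up, sketched here under the assumption that the car package supplies vif() (as in the call above), is to drop predictors greedily until every remaining VIF falls under the cutoff of 10:
library(car)
# Greedy VIF pruning: repeatedly remove the predictor with the largest VIF
bank_data$y_num <- as.numeric(bank_data$y == 1) # numeric target for the auxiliary lm
predictors <- c("age", "campaign", "emp.var.rate", "cons.price.idx",
                "cons.conf.idx", "euribor3m", "nr.employed")
repeat {
  v <- vif(lm(reformulate(predictors, response = "y_num"), data = bank_data))
  if (max(v) < 10) break                  # all remaining VIFs acceptable
  predictors <- predictors[-which.max(v)] # drop the worst offender
}
v # VIFs of the retained predictors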
B. Boxplots comparing numeric variables with respect to the Subscription Status(target variable ‘y’)
# Boxplots comparing numeric variables with respect to the target variable 'y'
for (num_var in num_vars) {
  p <- ggplot(bank_data, aes(x = y, y = .data[[num_var]], fill = y)) + # tidy eval instead of aes_string()
    geom_boxplot(outlier.colour = "maroon", outlier.shape = 8, outlier.size = 2) + # customized outlier markers
    labs(title = paste("Boxplot of", num_var, "by Subscription Status"), x = "Subscription Status", y = num_var) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_viridis_d(alpha = 0.6) + # viridis palette for the two target classes
    theme_minimal()
  print(p)
}
Inference:
The three collinear variables (emp.var.rate, euribor3m, and nr.employed), along with cons.conf.idx and cons.price.idx, each show clear differences in distribution across the two subscription classes. This tells us two important things:
Predictive value: each of these variables is individually informative for predicting the target (see the sketch below).
Multicollinearity concern: despite their individual predictive power, their high collinearity (as evidenced by the high VIFs) can destabilize regression-based models.
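To back the "individually informative" claim, a univariate AUC can be computed per variable, treating each on its own as a score for class 1. A minimal sketch, assuming library(pROC) is loaded (it is used later in this report):
# Univariate AUC: how well each economic indicator alone separates the classes
econ_vars <- c("emp.var.rate", "euribor3m", "nr.employed",
               "cons.price.idx", "cons.conf.idx")
sapply(econ_vars, function(v) as.numeric(auc(roc(bank_data$y, bank_data[[v]]))))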
C. Categorical vs. Target: Bar plots for each categorical variable
# Regular Stacked Bar Plot for categorical variables vs target 'y'
for (cat_var in cat_vars) {
cat("Chi-Square test for", cat_var, "\n")
table_data <- table(bank_data[[cat_var]], bank_data$y)
print(chisq.test(table_data))
cat("\n")
  p <- ggplot(bank_data, aes(x = .data[[cat_var]], fill = y)) + # tidy eval instead of aes_string()
    geom_bar(position = "fill") + # stacked bars normalized to proportions
labs(title = paste("Proportion of", cat_var, "by Subscription Status"),
x = cat_var, y = "Proportion") +
    scale_fill_viridis_d(alpha = 0.6) + # discrete viridis fill for the two classes
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) # set after theme_minimal() so it is not overridden
print(p)
}
Chi-Square test for job
Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: table_data
X-squared = 69.979, df = 11, p-value = 1.233e-10
Chi-Square test for marital
Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: table_data
X-squared = 10.286, df = 3, p-value = 0.01629
Chi-Square test for education
Pearson's Chi-squared test
data: table_data
X-squared = 22.045, df = 6, p-value = 0.001188
Chi-Square test for default
Pearson's Chi-squared test with Yates' continuity correction
data: table_data
X-squared = 23.533, df = 1, p-value = 1.228e-06
Chi-Square test for housing
Pearson's Chi-squared test
data: table_data
X-squared = 0.62738, df = 2, p-value = 0.7307
Chi-Square test for loan
Pearson's Chi-squared test
data: table_data
X-squared = 1.1297, df = 2, p-value = 0.5684
Chi-Square test for contact
Pearson's Chi-squared test with Yates' continuity correction
data: table_data
X-squared = 76.846, df = 1, p-value < 2.2e-16
Chi-Square test for month
Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: table_data
X-squared = 299.79, df = 9, p-value < 2.2e-16
Chi-Square test for day_of_week
Pearson's Chi-squared test
data: table_data
X-squared = 0.51218, df = 4, p-value = 0.9723
Chi-Square test for poutcome
Pearson's Chi-squared test
data: table_data
X-squared = 454.49, df = 2, p-value < 2.2e-16
Chi-Square test for pdays_cat
Pearson's Chi-squared test with Yates' continuity correction
data: table_data
X-squared = 448.22, df = 1, p-value < 2.2e-16
Chi-Square test for previous_cat
Pearson's Chi-squared test
data: table_data
X-squared = 258.52, df = 2, p-value < 2.2e-16
Interpretation:
The following variables have p-values > 0.05, so they may not be strong predictors: housing (p = 0.7307), loan (p = 0.5684), and day_of_week (p = 0.9723). Based on the chi-square tests, these variables are not significantly associated with y; the same pattern is visible in the proportion-based stacked bar plots. We therefore consider removing them from the model to reduce noise.
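p-values conflate effect size with sample size; Cramér's V gives a comparable 0-1 strength-of-association measure for each categorical variable. A minimal sketch using only base R (the helper name cramers_v is ours):
# Cramér's V = sqrt(chi-squared / (n * (min(rows, cols) - 1)))
cramers_v <- function(x, y) {
  tab <- table(x, y)
  chi <- suppressWarnings(chisq.test(tab, correct = FALSE))$statistic
  as.numeric(sqrt(chi / (sum(tab) * (min(dim(tab)) - 1))))
}
sapply(cat_vars, function(v) cramers_v(bank_data[[v]], bank_data$y))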
i. Removing less significant features
bank_data <- subset(bank_data, select = -c(housing, loan, day_of_week))
D. Plots to analyze highly collinear variables
i. Plot for Relationship between emp.var.rate, nr.employed, and euribor3m
ggplot(bank_data, aes(x = emp.var.rate, y = nr.employed, color = euribor3m)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_gradient(low = "blue", high = "red") +
labs(title = "Relationship between emp.var.rate, nr.employed, and euribor3m",
x = "emp.var.rate", y = "nr.employed", color = "euribor3m") +
  theme_minimal()
Inference: these visual cues suggest that all three variables move together.
ii. Combination variable creation for the 3 collinear variables
# Equal-weight combination of the three scaled variables
bank_data$comb_var <- scale(bank_data$emp.var.rate) + scale(bank_data$euribor3m) + scale(bank_data$nr.employed)
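A data-driven alternative to the equal-weight sum, sketched here as an option rather than the approach used below, is the first principal component of the three variables, which picks the weights that capture the most shared variance:
# First principal component of the three collinear indicators
pc <- prcomp(bank_data[, c("emp.var.rate", "euribor3m", "nr.employed")],
             center = TRUE, scale. = TRUE)
summary(pc)$importance[2, 1]      # proportion of variance explained by PC1
# bank_data$comb_var <- pc$x[, 1] # drop-in replacement for the equal-weight sum, if preferred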
# Remove the original variables after creating the linear combination
bank_data_filtered <- subset(bank_data, select = -c(emp.var.rate, euribor3m, nr.employed))
4. Data Modeling
i. Train Test Split
bank_data_filtered$y <- as.factor(bank_data_filtered$y)
# Split the dataset into training and testing sets
set.seed(101)
trainIndex <- createDataPartition(bank_data_filtered$y, p = 0.8, list = FALSE, times = 1)
train_data <- bank_data_filtered[trainIndex, ]
test_data <- bank_data_filtered[-trainIndex, ]
ii. Outlier Treatment using IQR method
numeric_vars <- names(bank_data_filtered)[sapply(bank_data_filtered, is.numeric)]
for (var in numeric_vars) {
Q1 <- quantile(train_data[[var]], 0.25, na.rm = TRUE)
Q3 <- quantile(train_data[[var]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
  # Cap values using bounds learned from the training split only (avoids test-set leakage)
  train_data[[var]] <- pmin(pmax(train_data[[var]], lower_bound), upper_bound)
  test_data[[var]] <- pmin(pmax(test_data[[var]], lower_bound), upper_bound)
}
iii. Feature scaling
# Fit the pre-processing (scaling) on the train data
preprocess_params <- preProcess(train_data[, numeric_vars], method = c("center", "scale"))
train_data[, numeric_vars] <- predict(preprocess_params, newdata = train_data[, numeric_vars])
test_data[, numeric_vars] <- predict(preprocess_params, newdata = test_data[, numeric_vars])
train_data$y <- factor(train_data$y, levels = c(0, 1), labels = c("No", "Yes"))
test_data$y <- factor(test_data$y, levels = c(0, 1), labels = c("No", "Yes"))
baseline
# Train Logistic Regression (baseline model)
logistic_model <- train(y ~ ., data = train_data, method = "glm", family = "binomial", trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE))
# Train LDA (baseline model)
lda_model <- train(y ~ ., data = train_data, method = "lda", trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE))
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
# Train Random Forest (baseline model)
rf_model <- train(y ~ ., data = train_data, method = "rf", trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE))
# Predictions for test_data
pred_logit <- predict(logistic_model, newdata = test_data)
pred_lda <- predict(lda_model, newdata = test_data)
pred_rf <- predict(rf_model, newdata = test_data)
# Confusion matrices
cm_logit <- confusionMatrix(pred_logit, test_data$y)
cm_lda <- confusionMatrix(pred_lda, test_data$y)
cm_rf <- confusionMatrix(pred_rf, test_data$y)
# Print classification metrics (Precision, Recall, F1)
cat("Logistic Regression Metrics:\n")Logistic Regression Metrics:
print(cm_logit$byClass) Sensitivity Specificity Pos Pred Value
0.9809004 0.2666667 0.9159236
Neg Pred Value Precision Recall
0.6315789 0.9159236 0.9809004
F1 Prevalence Detection Rate
0.9472991 0.8906440 0.8736330
Detection Prevalence Balanced Accuracy
0.9538275 0.6237835
cat("\nLDA Metrics:\n")
LDA Metrics:
print(cm_lda$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.9631651 0.3888889 0.9277267
Neg Pred Value Precision Recall
0.5645161 0.9277267 0.9631651
F1 Prevalence Detection Rate
0.9451138 0.8906440 0.8578372
Detection Prevalence Balanced Accuracy
0.9246659 0.6760270
cat("\nRandom Forest Metrics:\n")
Random Forest Metrics:
print(cm_rf$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.9918145 0.2000000 0.9098874
Neg Pred Value Precision Recall
0.7500000 0.9098874 0.9918145
F1 Prevalence Detection Rate
0.9490862 0.8906440 0.8833536
Detection Prevalence Balanced Accuracy
0.9708384 0.5959072
# AUC-ROC calculation for baseline models
logit_probs <- predict(logistic_model, newdata = test_data, type = "prob")
lda_probs <- predict(lda_model, newdata = test_data, type = "prob")
rf_probs <- predict(rf_model, newdata = test_data, type = "prob")
# Calculate AUC for baseline models
auc_logit <- auc(roc(test_data$y, logit_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_lda <- auc(roc(test_data$y, lda_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_rf <- auc(roc(test_data$y, rf_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Print AUC for baseline models
cat("AUC for Logistic Regression (baseline):", auc_logit, "\n")
AUC for Logistic Regression (baseline): 0.7857663
cat("AUC for LDA (baseline):", auc_lda, "\n")
AUC for LDA (baseline): 0.788328
cat("AUC for Random Forest (baseline):", auc_rf, "\n")
AUC for Random Forest (baseline): 0.7568971
custom weights
train_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary, verboseIter = FALSE)
# Compute class distribution
class_counts <- table(train_data$y)
total_samples <- sum(class_counts)
# Compute class weights (inverse of class frequencies)
class_weights <- total_samples / (length(class_counts) * class_counts)
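As a sanity check on the inverse-frequency formula, here is a worked example with illustrative counts approximating an 80% split of the 3668/451 full-data classes (the exact training counts depend on the seed):
# Inverse-frequency weights: total / (number_of_classes * class_count)
counts <- c(No = 2935, Yes = 361)       # illustrative 80%-split counts
sum(counts) / (length(counts) * counts)
# No ~ 0.56, Yes ~ 4.57: minority-class errors weigh ~8x more, matching the ~8:1 imbalance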
# Train Logistic Regression with weights
logistic_weighted <- train(y ~ ., data = train_data, method = "glm", family = "binomial",
                           trControl = train_control, weights = class_weights[train_data$y])
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# Train LDA with weights
lda_weighted <- train(y ~ ., data = train_data, method = "lda",
                      trControl = train_control, weights = class_weights[train_data$y])
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
# Train Random Forest with weights
rf_weighted <- train(y ~ ., data = train_data, method = "rf",
                     trControl = train_control, weights = class_weights[train_data$y])
Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
in the result set. ROC will be used instead.
# Predictions for weighted models
pred_logit_weighted <- predict(logistic_weighted, newdata = test_data)
pred_lda_weighted <- predict(lda_weighted, newdata = test_data)
pred_rf_weighted <- predict(rf_weighted, newdata = test_data)
# Get confusion matrices
cm_logit_weighted <- confusionMatrix(pred_logit_weighted, test_data$y)
cm_lda_weighted <- confusionMatrix(pred_lda_weighted, test_data$y)
cm_rf_weighted <- confusionMatrix(pred_rf_weighted, test_data$y)
# Print Classification Report (Precision, Recall, F1)
print(cm_logit_weighted$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.8376535 0.6222222 0.9475309
Neg Pred Value Precision Recall
0.3200000 0.9475309 0.8376535
F1 Prevalence Detection Rate
0.8892107 0.8906440 0.7460510
Detection Prevalence Balanced Accuracy
0.7873633 0.7299379
print(cm_lda_weighted$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.9631651 0.3888889 0.9277267
Neg Pred Value Precision Recall
0.5645161 0.9277267 0.9631651
F1 Prevalence Detection Rate
0.9451138 0.8906440 0.8578372
Detection Prevalence Balanced Accuracy
0.9246659 0.6760270
print(cm_rf_weighted$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.9904502 0.2000000 0.9097744
Neg Pred Value Precision Recall
0.7200000 0.9097744 0.9904502
F1 Prevalence Detection Rate
0.9483997 0.8906440 0.8821385
Detection Prevalence Balanced Accuracy
0.9696233 0.5952251
# AUC-ROC Calculation for weighted models
logit_weighted_probs <- predict(logistic_weighted, newdata = test_data, type = "prob")
lda_weighted_probs <- predict(lda_weighted, newdata = test_data, type = "prob")
rf_weighted_probs <- predict(rf_weighted, newdata = test_data, type = "prob")
# Calculate AUC for weighted models
auc_logit_weighted <- auc(roc(test_data$y, logit_weighted_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_lda_weighted <- auc(roc(test_data$y, lda_weighted_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_rf_weighted <- auc(roc(test_data$y, rf_weighted_probs$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Print AUC for weighted models
cat("AUC for Logistic Regression Weighted:", auc_logit_weighted, "\n")AUC for Logistic Regression Weighted: 0.7800364
cat("AUC for LDA Weighted:", auc_lda_weighted, "\n")AUC for LDA Weighted: 0.788328
cat("AUC for Random Forest Weighted:", auc_rf_weighted, "\n")AUC for Random Forest Weighted: 0.7659997
upsampling
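With sampling = "up", caret resamples the minority class with replacement inside each cross-validation fold, so the held-out fold is never inflated with duplicates. The equivalent one-off operation, shown purely for illustration with caret::upSample:
# Manual illustration of what sampling = "up" performs within each fold
up_train <- upSample(x = subset(train_data, select = -y),
                     y = train_data$y, yname = "y")
table(up_train$y) # both classes now match the majority-class count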
cv_control_upsampled <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary, sampling = "up", verboseIter = FALSE, savePredictions = "final")
model_logit_upsampled <- train(y ~ ., data = train_data, method = "glm", family = binomial, trControl = cv_control_upsampled, metric = "ROC")
model_lda_upsampled <- train(y ~ ., data = train_data, method = "lda", trControl = cv_control_upsampled, metric = "ROC")
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
Warning in lda.default(x, grouping, ...): variables are collinear
model_rf_upsampled <- train(y ~ ., data = train_data, method = "rf", trControl = cv_control_upsampled, metric = "ROC")
# Predictions on test sample
pred_logit_upsampled <- predict(model_logit_upsampled, newdata = test_data)
pred_lda_upsampled <- predict(model_lda_upsampled, newdata = test_data)
pred_rf_upsampled <- predict(model_rf_upsampled, newdata = test_data)
# Get confusion matrices
cm_logit_upsampled <- confusionMatrix(pred_logit_upsampled, test_data$y)
cm_lda_upsampled <- confusionMatrix(pred_lda_upsampled, test_data$y)
cm_rf_upsampled <- confusionMatrix(pred_rf_upsampled, test_data$y)
# Print Classification Report (Precision, Recall, F1)
print(cm_logit_upsampled$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.8485675 0.6222222 0.9481707
Neg Pred Value Precision Recall
0.3353293 0.9481707 0.8485675
F1 Prevalence Detection Rate
0.8956084 0.8906440 0.7557716
Detection Prevalence Balanced Accuracy
0.7970838 0.7353949
print(cm_lda_upsampled$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.8321965 0.6333333 0.9486781
Neg Pred Value Precision Recall
0.3166667 0.9486781 0.8321965
F1 Prevalence Detection Rate
0.8866279 0.8906440 0.7411908
Detection Prevalence Balanced Accuracy
0.7812880 0.7327649
print(cm_rf_upsampled$byClass)
         Sensitivity          Specificity       Pos Pred Value
0.9017735 0.5555556 0.9429387
Neg Pred Value Precision Recall
0.4098361 0.9429387 0.9017735
F1 Prevalence Detection Rate
0.9218968 0.8906440 0.8031592
Detection Prevalence Balanced Accuracy
0.8517618 0.7286645
# AUC-ROC Calculation
logit_probs_upsampled <- predict(model_logit_upsampled, newdata = test_data, type = "prob")
lda_probs_upsampled <- predict(model_lda_upsampled, newdata = test_data, type = "prob")
rf_probs_upsampled <- predict(model_rf_upsampled, newdata = test_data, type = "prob")
# Calculate AUC
auc_logit_upsampled <- auc(roc(test_data$y, logit_probs_upsampled$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_lda_upsampled <- auc(roc(test_data$y, lda_probs_upsampled$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
auc_rf_upsampled <- auc(roc(test_data$y, rf_probs_upsampled$Yes))
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Print AUC for each model
cat("AUC for Logistic Regression:", auc_logit_upsampled, "\n")AUC for Logistic Regression: 0.7760952
cat("AUC for LDA:", auc_lda_upsampled, "\n")AUC for LDA: 0.7829923
cat("AUC for Random Forest:", auc_rf_upsampled, "\n")AUC for Random Forest: 0.7801804
Decision Thresholding
library(pROC)
library(caret)
library(ggplot2)
# Function to calculate F1-score from confusion matrix
get_f1_score <- function(preds, actuals) {
cm <- confusionMatrix(as.factor(preds), as.factor(actuals))
return(cm$byClass["F1"])
}
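For instance, the helper can be applied to the upsampled random-forest predictions from the previous section:
get_f1_score(pred_rf_upsampled, test_data$y)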
# Calculate ROC and AUC for each model
roc_logit <- roc(test_data$y, logit_probs_upsampled$Yes)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
roc_lda <- roc(test_data$y, lda_probs_upsampled$Yes)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
roc_rf <- roc(test_data$y, rf_probs_upsampled$Yes)
Setting levels: control = No, case = Yes
Setting direction: controls < cases
# Sensitivity and Specificity for each model at different thresholds
sens_spec_logit <- data.frame(
threshold = roc_logit$thresholds,
sensitivity = roc_logit$sensitivities,
specificity = roc_logit$specificities
)
sens_spec_lda <- data.frame(
threshold = roc_lda$thresholds,
sensitivity = roc_lda$sensitivities,
specificity = roc_lda$specificities
)
sens_spec_rf <- data.frame(
threshold = roc_rf$thresholds,
sensitivity = roc_rf$sensitivities,
specificity = roc_rf$specificities
)
# Calculate the difference between sensitivity and specificity at each threshold
diff_logit <- abs(sens_spec_logit$sensitivity - sens_spec_logit$specificity)
diff_lda <- abs(sens_spec_lda$sensitivity - sens_spec_lda$specificity)
diff_rf <- abs(sens_spec_rf$sensitivity - sens_spec_rf$specificity)
# Find the threshold where this difference is minimized (best balance)
optimal_logit_threshold <- sens_spec_logit$threshold[which.min(diff_logit)]
optimal_lda_threshold <- sens_spec_lda$threshold[which.min(diff_lda)]
optimal_rf_threshold <- sens_spec_rf$threshold[which.min(diff_rf)]
cat("Optimal Threshold for Logistic Regression (Balance Sensitivity-Specificity):", optimal_logit_threshold, "\n")Optimal Threshold for Logistic Regression (Balance Sensitivity-Specificity): 0.3840407
cat("Optimal Threshold for LDA (Balance Sensitivity-Specificity):", optimal_lda_threshold, "\n")Optimal Threshold for LDA (Balance Sensitivity-Specificity): 0.3662219
cat("Optimal Threshold for Random Forest (Balance Sensitivity-Specificity):", optimal_rf_threshold, "\n")Optimal Threshold for Random Forest (Balance Sensitivity-Specificity): 0.222
# Plot ROC curves and Sensitivity vs 1-Specificity curves together
ggplot() +
# ROC curve for Logistic Regression
geom_line(data = data.frame(fpr = 1 - roc_logit$specificities, tpr = roc_logit$sensitivities),
aes(x = fpr, y = tpr), color = "blue", size = 1.2) +
# ROC curve for LDA
geom_line(data = data.frame(fpr = 1 - roc_lda$specificities, tpr = roc_lda$sensitivities),
aes(x = fpr, y = tpr), color = "red", size = 1.2) +
# ROC curve for Random Forest
geom_line(data = data.frame(fpr = 1 - roc_rf$specificities, tpr = roc_rf$sensitivities),
aes(x = fpr, y = tpr), color = "green", size = 1.2) +
# Sensitivity vs 1-Specificity curve for Logistic Regression
geom_line(data = sens_spec_logit, aes(x = 1 - specificity, y = sensitivity), linetype = "dashed", color = "blue") +
# Sensitivity vs 1-Specificity curve for LDA
geom_line(data = sens_spec_lda, aes(x = 1 - specificity, y = sensitivity), linetype = "dashed", color = "red") +
# Sensitivity vs 1-Specificity curve for Random Forest
geom_line(data = sens_spec_rf, aes(x = 1 - specificity, y = sensitivity), linetype = "dashed", color = "green") +
# Add optimal threshold lines (dotted vertical lines)
geom_vline(xintercept = 1 - optimal_logit_threshold, color = "blue", linetype = "dotted", linewidth = 1) +
geom_vline(xintercept = 1 - optimal_lda_threshold, color = "red", linetype = "dotted", linewidth = 1) +
geom_vline(xintercept = 1 - optimal_rf_threshold, color = "green", linetype = "dotted", linewidth = 1) +
# Add labels and title
labs(title = "ROC and Sensitivity-Specificity Curves with Optimal Thresholds (Balance)",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)") +
# Adjust axis limits for visibility
xlim(0, 1) +
ylim(0, 1) +
theme_minimal() +
theme(legend.title = element_blank()) +
scale_color_manual(values = c("blue", "red", "green"), labels = c("Logistic Regression", "LDA", "Random Forest")) +
scale_linetype_manual(values = c("solid", "solid", "solid"), labels = c("ROC", "ROC", "ROC")) +
guides(color = guide_legend(title = "Model"), linetype = guide_legend(title = "Curve"))
# Function to calculate performance metrics at a specific threshold
calculate_metrics_at_threshold <- function(pred_probs, actuals, threshold) {
# Convert probabilities to class predictions based on the threshold
preds <- ifelse(pred_probs > threshold, "Yes", "No")
# Create confusion matrix
cm <- confusionMatrix(as.factor(preds), as.factor(actuals))
# Extract relevant metrics from the confusion matrix
metrics <- cm$byClass
metrics["Threshold"] <- threshold
return(metrics)
}
# Calculate metrics for Logistic Regression at optimal threshold
logit_metrics <- calculate_metrics_at_threshold(logit_probs_upsampled$Yes, test_data$y, optimal_logit_threshold)
# Calculate metrics for LDA at optimal threshold
lda_metrics <- calculate_metrics_at_threshold(lda_probs_upsampled$Yes, test_data$y, optimal_lda_threshold)
# Calculate metrics for Random Forest at optimal threshold
rf_metrics <- calculate_metrics_at_threshold(rf_probs_upsampled$Yes, test_data$y, optimal_rf_threshold)
# Combine all metrics into a data frame
all_metrics <- rbind(logit_metrics, lda_metrics, rf_metrics)
rownames(all_metrics) <- c("Logistic Regression", "LDA", "Random Forest")
# Print the metrics
print(all_metrics)
                    Sensitivity Specificity Pos Pred Value Neg Pred Value
Logistic Regression 0.6780355 0.6777778 0.9448669 0.2053872
LDA 0.6998636 0.7000000 0.9500000 0.2226148
Random Forest 0.7107776 0.7111111 0.9524680 0.2318841
Precision Recall F1 Prevalence Detection Rate
Logistic Regression 0.9448669 0.6780355 0.7895155 0.890644 0.6038882
LDA 0.9500000 0.6998636 0.8059701 0.890644 0.6233293
Random Forest 0.9524680 0.7107776 0.8140625 0.890644 0.6330498
Detection Prevalence Balanced Accuracy Threshold
Logistic Regression 0.6391252 0.6779066 0.3840407
LDA 0.6561361 0.6999318 0.3662219
Random Forest 0.6646416 0.7109444 0.2220000
At its optimal threshold, Random Forest shows the best overall performance across sensitivity, specificity, precision, and F1 score.