Our objective is to predict term deposit subscriptions with a
strategic perspective: - Business Goal: Optimize
marketing campaign efficiency - Target Variable:
y (term deposit subscription) - Prediction
Perspective: * Minimize unnecessary calls * Maximize conversion
probability * Reduce customer acquisition cost
# Load full dataset
data <- read.csv("bank-full.csv", sep = ";", stringsAsFactors = FALSE)
# Basic data preprocessing
data <- data %>%
mutate(
# Convert target variable to factor
y = factor(y, levels = c("no", "yes")),
# Convert binary variables to factor
across(c(default, housing, loan), ~factor(., levels = c("no", "yes"))),
# Convert categorical variables to factor
across(c(job, marital, education, contact, month, poutcome), as.factor)
)
# Check data structure
str(data)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Initial data overview
glimpse(data)
## Rows: 45,211
## Columns: 17
## $ age <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <fct> management, technician, entrepreneur, blue-collar, unknown, …
## $ marital <fct> married, single, married, married, single, married, single, …
## $ education <fct> tertiary, secondary, secondary, unknown, unknown, tertiary, …
## $ default <fct> no, no, no, no, no, no, no, yes, no, no, no, no, no, no, no,…
## $ balance <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <fct> yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes, yes, y…
## $ loan <fct> no, no, yes, no, no, no, yes, no, no, no, no, no, no, no, no…
## $ contact <fct> unknown, unknown, unknown, unknown, unknown, unknown, unknow…
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <fct> may, may, may, may, may, may, may, may, may, may, may, may, …
## $ duration <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <fct> unknown, unknown, unknown, unknown, unknown, unknown, unknow…
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, …
summary(data)
## age job marital education
## Min. :18.00 blue-collar:9732 divorced: 5207 primary : 6851
## 1st Qu.:33.00 management :9458 married :27214 secondary:23202
## Median :39.00 technician :7597 single :12790 tertiary :13301
## Mean :40.94 admin. :5171 unknown : 1857
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6835
## default balance housing loan contact
## no :44396 Min. : -8019 no :20081 no :37967 cellular :29285
## yes: 815 1st Qu.: 72 yes:25130 yes: 7244 telephone: 2906
## Median : 448 unknown :13020
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
##
## day month duration campaign
## Min. : 1.00 may :13766 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.00 jul : 6895 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.00 aug : 6247 Median : 180.0 Median : 2.000
## Mean :15.81 jun : 5341 Mean : 258.2 Mean : 2.764
## 3rd Qu.:21.00 nov : 3970 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.00 apr : 2932 Max. :4918.0 Max. :63.000
## (Other): 6060
## pdays previous poutcome y
## Min. : -1.0 Min. : 0.0000 failure: 4901 no :39922
## 1st Qu.: -1.0 1st Qu.: 0.0000 other : 1840 yes: 5289
## Median : -1.0 Median : 0.0000 success: 1511
## Mean : 40.2 Mean : 0.5803 unknown:36959
## 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :871.0 Max. :275.0000
##
# Advanced feature engineering
data <- data %>%
mutate(
# Age categorization
age_group = cut(age,
breaks = c(0, 25, 40, 55, 100),
labels = c("Young", "Middle-Young", "Middle-Aged", "Senior")),
# Balance risk categorization
balance_risk = case_when(
balance < 0 ~ "Negative",
balance == 0 ~ "Zero",
balance > 0 & balance <= 1000 ~ "Low",
balance > 1000 & balance <= 5000 ~ "Medium",
TRUE ~ "High"
),
# Campaign intensity
contact_intensity = case_when(
campaign <= 2 ~ "Low",
campaign <= 5 ~ "Medium",
campaign > 5 ~ "High"
),
# Previous campaign performance
previous_campaign_performance = case_when(
poutcome == "success" ~ "Successful",
poutcome == "failure" ~ "Failed",
TRUE ~ "No Prior Contact"
)
) %>%
mutate(y = factor(y, levels = c("no", "yes")))
# Correlation and Information Gain based feature selection
selected_features <- c(
"age_group", "job", "marital", "education",
"balance_risk", "housing", "loan",
"contact", "contact_intensity",
"previous_campaign_performance"
)
# Prepare modeling dataset
model_data <- data[, c(selected_features, "y")]
set.seed(42)
train_index <- createDataPartition(model_data$y, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# Class distribution check
table(train_data$y)
##
## no yes
## 27946 3703
table(test_data$y)
##
## no yes
## 11976 1586
# Stratified 10-fold cross-validation
control <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 3,
summaryFunction = prSummary,
classProbs = TRUE,
savePredictions = "final"
)
nb_model <- train(
y ~ .,
data = train_data,
method = "naive_bayes",
trControl = control,
metric = "AUC"
)
nb_predictions <- predict(nb_model, test_data)
nb_probs <- predict(nb_model, test_data, type = "prob")
nb_cm <- confusionMatrix(nb_predictions, test_data$y)
print(nb_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11976 1586
## yes 0 0
##
## Accuracy : 0.8831
## 95% CI : (0.8775, 0.8884)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.5067
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8831
## Neg Pred Value : NaN
## Prevalence : 0.8831
## Detection Rate : 0.8831
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : no
##
dt_model <- train(
y ~ .,
data = train_data,
method = "rpart",
trControl = control,
metric = "AUC",
tuneLength = 10
)
dt_predictions <- predict(dt_model, test_data)
dt_probs <- predict(dt_model, test_data, type = "prob")
dt_cm <- confusionMatrix(dt_predictions, test_data$y)
print(dt_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11824 1313
## yes 152 273
##
## Accuracy : 0.892
## 95% CI : (0.8866, 0.8972)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.0005709
##
## Kappa : 0.2336
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9873
## Specificity : 0.1721
## Pos Pred Value : 0.9001
## Neg Pred Value : 0.6424
## Prevalence : 0.8831
## Detection Rate : 0.8718
## Detection Prevalence : 0.9687
## Balanced Accuracy : 0.5797
##
## 'Positive' Class : no
##
# Decision Tree Visualization
rpart.plot(dt_model$finalModel, box.palette = "RdBu")
# ROC Curve
roc_nb <- roc(test_data$y, nb_probs$yes)
roc_dt <- roc(test_data$y, dt_probs$yes)
plot(roc_nb, col = "blue", main = "ROC Curves")
lines(roc_dt, col = "red")
legend("bottomright",
legend = c("Naive Bayes", "Decision Tree"),
col = c("blue", "red"),
lwd = 2)
performance_df <- data.frame(
Model = c("Naive Bayes", "Decision Tree"),
Accuracy = c(
nb_cm$overall["Accuracy"],
dt_cm$overall["Accuracy"]
),
Precision = c(
nb_cm$byClass["Precision"],
dt_cm$byClass["Precision"]
),
Recall = c(
nb_cm$byClass["Recall"],
dt_cm$byClass["Recall"]
),
F1_Score = c(
nb_cm$byClass["F1"],
dt_cm$byClass["F1"]
),
AUC = c(
auc(roc_nb),
auc(roc_dt)
)
)
print(performance_df)
## Model Accuracy Precision Recall F1_Score AUC
## 1 Naive Bayes 0.8830556 0.8830556 1.0000000 0.9378965 0.7288701
## 2 Decision Tree 0.8919776 0.9000533 0.9873079 0.9416637 0.7172295
# Parallel processing
cores <- detectCores() - 1
registerDoParallel(cores)
# Specific Random Forest control
rf_control <- trainControl(
method = "cv", # Cross-validation
number = 5, # Fewer folds
allowParallel = TRUE
)
# Constrain Random Forest parameters
rf_model <- train(
y ~ .,
data = train_data,
method = "rf",
trControl = rf_control,
metric = "Kappa",
ntree = 100, # Reduce number of trees
maxnodes = 50, # Limit tree complexity
tuneLength = 3 # Fewer tuning iterations
)
# Predictions
rf_predictions <- predict(rf_model, test_data)
rf_cm <- confusionMatrix(rf_predictions, test_data$y)
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11823 1306
## yes 153 280
##
## Accuracy : 0.8924
## 95% CI : (0.8871, 0.8976)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 0.0003164
##
## Kappa : 0.2392
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9872
## Specificity : 0.1765
## Pos Pred Value : 0.9005
## Neg Pred Value : 0.6467
## Prevalence : 0.8831
## Detection Rate : 0.8718
## Detection Prevalence : 0.9681
## Balanced Accuracy : 0.5819
##
## 'Positive' Class : no
##
# Clean up parallel processing
stopImplicitCluster()
# Variable importance
plot(varImp(dt_model), main = "Feature Importance")
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Bangkok
## tzcode source: internal
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.17 iterators_1.0.14 foreach_1.5.2 rpart.plot_3.1.2
## [5] rpart_4.1.23 e1071_1.7-16 glmnet_4.1-8 Matrix_1.7-0
## [9] MASS_7.3-60.2 pROC_1.18.5 caret_6.0-94 lattice_0.22-6
## [13] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [17] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [21] ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4041.110 fastmap_1.2.0
## [4] digest_0.6.37 timechange_0.3.0 lifecycle_1.0.4
## [7] survival_3.6-4 ROCR_1.0-11 magrittr_2.0.3
## [10] compiler_4.4.1 rlang_1.1.4 sass_0.4.9
## [13] tools_4.4.1 utf8_1.2.4 yaml_2.3.10
## [16] data.table_1.16.2 knitr_1.48 plyr_1.8.9
## [19] withr_3.0.1 MLmetrics_1.1.3 nnet_7.3-19
## [22] grid_4.4.1 stats4_4.4.1 fansi_1.0.6
## [25] colorspace_2.1-1 future_1.34.0 globals_0.16.3
## [28] scales_1.3.0 cli_3.6.3 rmarkdown_2.28
## [31] generics_0.1.3 rstudioapi_0.16.0 future.apply_1.11.3
## [34] reshape2_1.4.4 tzdb_0.4.0 cachem_1.1.0
## [37] proxy_0.4-27 splines_4.4.1 vctrs_0.6.5
## [40] hardhat_1.4.0 jsonlite_1.8.8 hms_1.1.3
## [43] listenv_0.9.1 gower_1.0.1 jquerylib_0.1.4
## [46] recipes_1.1.0 glue_1.7.0 parallelly_1.38.0
## [49] codetools_0.2-20 stringi_1.8.4 gtable_0.3.5
## [52] shape_1.4.6.1 munsell_0.5.1 pillar_1.9.0
## [55] htmltools_0.5.8.1 naivebayes_1.0.0 randomForest_4.7-1.2
## [58] ipred_0.9-15 lava_1.8.0 R6_2.5.1
## [61] evaluate_0.24.0 highr_0.11 bslib_0.8.0
## [64] class_7.3-22 Rcpp_1.0.13 nlme_3.1-164
## [67] prodlim_2024.06.25 xfun_0.47 pkgconfig_2.0.3
## [70] ModelMetrics_1.2.2.2