A Portuguese bank conducted a marketing campaign (phone calls) to predict if a client will subscribe to a term deposit The records of their efforts are available in the form of a dataset. The objective here is to apply machine learning techniques to analyze the dataset and figure out most effective tactics that will help the bank in next campaign to persuade more customers to subscribe to the bank’s term deposit.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read.csv('https://raw.githubusercontent.com/ddebonis47/classwork/refs/heads/main/bank-full.csv', sep = ';', stringsAsFactors = TRUE)
# all copied and pasted from previous assignment
median_prev <- median(data$previous, na.rm = TRUE)
data$previous <- ifelse(data$previous == 275, median_prev, data$previous)
data$pdays <- as.numeric(data$pdays)
data$pdays[is.na(data$pdays)] <- -1
data <- data |>
mutate(poutcome=na_if(poutcome, "unknown"))
data <- data |>
mutate(job=na_if(job, "unknown"))
data <- data |>
mutate(education=na_if(education, "unknown"))
data <- data |>
mutate(contact=na_if(contact, "unknown"))
### changed coding of date based on feedback
data <- data |>
mutate(
month_factor = factor(month, levels = tolower(month.abb), ordered = TRUE),
# Assign quarters based on month
quarter = case_when(
month_factor %in% c("jan","feb","mar") ~ "Q1",
month_factor %in% c("apr","may","jun") ~ "Q2",
month_factor %in% c("jul","aug","sep") ~ "Q3",
month_factor %in% c("oct","nov","dec") ~ "Q4"
),
# Assign seasons
season = case_when(
month_factor %in% c("dec","jan","feb") ~ "Winter",
month_factor %in% c("mar","apr","may") ~ "Spring",
month_factor %in% c("jun","jul","aug") ~ "Summer",
month_factor %in% c("sep","oct","nov") ~ "Fall"
),
# Bucket day into early/mid/late month
day_group = case_when(
day <= 10 ~ "Early",
day <= 20 ~ "Mid",
TRUE ~ "Late"
)
)
### ensuring removal of duration
data <- data |>
select(-duration)
data$total_contacts <- data$campaign + data$previous
data$contact_ratio <- data$campaign / pmax(data$previous, 1)
data$contacts_per_campaign <- with(data,
ifelse((campaign + previous) > 0,
campaign / (campaign + previous),
NA))
data$contacts_per_campaign[is.na(data$contacts_per_campaign)] <- 0
# Columns where unknown or NA should be a single level
cat_vars <- c("poutcome","job","education","contact","quarter","season","day_group")
for(col in cat_vars){
data[[col]] <- as.character(data[[col]])
data[[col]][is.na(data[[col]])] <- "unknown" # replace NAs
data[[col]][data[[col]]=="unknown"] <- "unknown" # replace any lingering "unknown"
data[[col]] <- factor(data[[col]]) # convert back to factor
}
library(rpart)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(PRROC)
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
set.seed(24601)
# Train / Test Split
n_runs <- 30
results_list <- vector("list", n_runs)
# Aligning Factors
#for(col in cat_vars){
# test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
#}
# Does the maximum depth of trees impact PR-AUC, Precision, Recall, or F1 score?
# Function to train and evaluate DT
run_dt_experiment <- function(train_data, test_data, maxdepth) {
dt_model <- rpart(y ~ ., data = train_data, method = "class",
control = rpart.control(maxdepth = maxdepth, cp = 0.01))
pred_probs <- predict(dt_model, test_data, type = "prob")[, "yes"]
# PR-AUC
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
# Precision, Recall, F1 at 0.5 threshold
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
factor(test_data$y, levels=c("no","yes")))
list(pr_auc = pr$auc.integral,
precision = conf_matrix$byClass["Precision"],
recall = conf_matrix$byClass["Recall"],
f1 = conf_matrix$byClass["F1"])
}
# Generate all 30 splits
set.seed(123)
n_runs <- 30
splits_list <- vector("list", n_runs)
for (i in 1:n_runs) {
train_index <- sample(seq_len(nrow(data)), size = 0.7 * nrow(data))
splits_list[[i]] <- list(
train = data[train_index, ],
test = data[-train_index, ]
)
}
# Run experiments for multiple depths on the same splits
depths <- c(5, 10)
results_all <- list()
for (d in depths) {
results_all[[paste0("depth_", d)]] <- lapply(splits_list, function(split) {
run_dt_experiment(split$train, split$test, maxdepth = d)
})
}
summarize_results <- function(results_list) {
df <- do.call(rbind, lapply(results_list, as.data.frame))
data.frame(
metric = colnames(df),
mean = apply(df, 2, mean),
sd = apply(df, 2, sd)
)
}
summary_depth5 <- summarize_results(results_all[["depth_5"]])
summary_depth10 <- summarize_results(results_all[["depth_10"]])
summary_depth5
## metric mean sd
## pr_auc pr_auc 0.2505872 0.008995448
## precision precision 0.9011401 0.002211405
## recall recall 0.9866902 0.000803682
## f1 f1 0.9419749 0.001142619
summary_depth10
## metric mean sd
## pr_auc pr_auc 0.2505872 0.008995448
## precision precision 0.9011401 0.002211405
## recall recall 0.9866902 0.000803682
## f1 f1 0.9419749 0.001142619
The results were identical! It appears that the trees are not going deep enough for this distinction to make any difference.
# Since we have so few 'yes' values, how does applying class weights to increase the presence of 'yes' values affect PR-AUC, precision, recall, and F1 score?
run_dt_weighted <- function(train_data, test_data, maxdepth = 5) {
# Compute class weights
class_counts <- table(train_data$y)
weights <- ifelse(train_data$y == "yes",
as.numeric(class_counts["no"] / class_counts["yes"]),
1)
# Train the model
dt_model <- rpart(y ~ ., data = train_data, method = "class",
weights = weights,
control = rpart.control(maxdepth = maxdepth, cp = 0.01))
# Predict probabilities
pred_probs <- predict(dt_model, test_data, type = "prob")[, "yes"]
# PR-AUC
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
# Precision, Recall, F1 at threshold 0.5
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
factor(test_data$y, levels=c("no","yes")))
# Return metrics
list(
pr_auc = pr$auc.integral,
precision = conf_matrix$byClass["Precision"],
recall = conf_matrix$byClass["Recall"],
f1 = conf_matrix$byClass["F1"]
)
}
# Run 30 experiments
results_weighted <- lapply(splits_list, function(split) {
run_dt_weighted(split$train, split$test, maxdepth = 5)
})
# Results
metrics_df <- do.call(rbind, lapply(results_weighted, as.data.frame))
summary_stats <- data.frame(
metric = colnames(metrics_df),
mean = apply(metrics_df, 2, mean),
sd = apply(metrics_df, 2, sd)
)
summary_stats
## metric mean sd
## pr_auc pr_auc 0.3500296 0.012530847
## precision precision 0.9292679 0.002607405
## recall recall 0.8646072 0.026364502
## f1 f1 0.8955405 0.013369177
PR-AUC increases as does precision to a lesser extent. Recall and F1 decrease. This is likely due to a decrease in overfitting that the weighting caused. This second model is more generalizable because the predictions are less influenced by the fact that the majority of them are ‘no.’ This is why we see an improvement in overall performance with the increase in PR-AUC, while also seeing a notable drop in recall.
# Does increasing the number of trees improve predictive performance?
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(24601)
# Function to run one Random Forest experiment with arbitrary ntree
run_rf_ntree <- function(train_data, test_data, ntree = 500) {
rf_model <- randomForest(
y ~ ., data = train_data, ntree = ntree)
# Predict probabilities
pred_probs <- predict(rf_model, test_data, type = "prob")[, "yes"]
# PR-AUC
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
# Precision, Recall, F1 at 0.5 threshold
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
factor(test_data$y, levels=c("no","yes")))
list(
ntree = ntree,
pr_auc = pr$auc.integral,
precision = conf_matrix$byClass["Precision"],
recall = conf_matrix$byClass["Recall"],
f1 = conf_matrix$byClass["F1"]
)
}
# Run across 30 splits
ntree_values <- c(100, 500)
results_rf_ntree <- list()
for (nt in ntree_values) {
results_rf_ntree[[paste0("ntree_", nt)]] <- lapply(splits_list, function(split) {
run_rf_ntree(split$train, split$test, ntree = nt)
})
}
# Results
summary_rf_ntree <- lapply(results_rf_ntree, function(res_list) {
metrics_df <- do.call(rbind, lapply(res_list, as.data.frame))
data.frame(
metric = colnames(metrics_df),
mean = apply(metrics_df, 2, mean),
sd = apply(metrics_df, 2, sd)
)
})
summary_rf_ntree
## $ntree_100
## metric mean sd
## ntree ntree 100.0000000 0.000000000
## pr_auc pr_auc 0.4344185 0.011471173
## precision precision 0.9076175 0.002754735
## recall recall 0.9789186 0.001700386
## f1 f1 0.9419169 0.001382787
##
## $ntree_500
## metric mean sd
## ntree ntree 500.0000000 0.000000000
## pr_auc pr_auc 0.4397201 0.010625560
## precision precision 0.9079014 0.002625369
## recall recall 0.9792888 0.001678334
## f1 f1 0.9422415 0.001323236
Overall, the differences shown between these results are very slight. The PR-AUC increases by about .006. That being said, the differences in computation time did not seem notable either. At 30 iterations, both took considerable amounts of time.
# Just as before, can we improve accuracy of our model by applying class weights?
# Function to run one Random Forest experiment
run_rf_weighted <- function(train_data, test_data, ntree = 100) {
# Compute class weights
class_counts <- table(train_data$y)
classwt <- as.numeric(class_counts["no"] / class_counts)
names(classwt) <- names(class_counts)
# Train Random Forest
rf_model <- randomForest(y ~ ., data = train_data, ntree = ntree,
classwt = classwt)
# Predict probabilities
pred_probs <- predict(rf_model, test_data, type = "prob")[, "yes"]
# PR-AUC
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
# Precision, Recall, F1 at 0.5
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
factor(test_data$y, levels=c("no","yes")))
list(
pr_auc = pr$auc.integral,
precision = conf_matrix$byClass["Precision"],
recall = conf_matrix$byClass["Recall"],
f1 = conf_matrix$byClass["F1"]
)
}
# Run across the 30 splits
results_rf <- lapply(splits_list, function(split) {
run_rf_weighted(split$train, split$test, ntree = 100)
})
# Results
metrics_rf <- do.call(rbind, lapply(results_rf, as.data.frame))
summary_rf <- data.frame(
metric = colnames(metrics_rf),
mean = apply(metrics_rf, 2, mean),
sd = apply(metrics_rf, 2, sd)
)
summary_rf
## metric mean sd
## pr_auc pr_auc 0.3362144 0.010147283
## precision precision 0.9117388 0.002408161
## recall recall 0.9450237 0.003839002
## f1 f1 0.9280761 0.001871955
The most notable difference here is the drop in PR-AUC, which is particularly surprising because the values for precision and recall are fairly similar across the two models, with recall taking a tumble. Possibly the weights did not improve the discrimination of the model because the bootstrapping used in random forest models handles these imbalances well on its own.
# Class weights helped a lot in decision trees, but actually hurt model performance using random forests. How does weighting impact the precision and recall when using AdaBoost?
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
set.seed(24601)
train_index <- createDataPartition(data$y, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Compute class weights (used for weighted AdaBoost)
class_counts <- table(train_data$y)
class_weights <- c(
"no" = 1,
"yes" = as.numeric(class_counts["no"] / class_counts["yes"])
)
n_runs <- 30
results_list <- vector("list", n_runs * 2) # twice as many for baseline + weighted
for (i in 1:n_runs) {
# Ensure factor levels match
cat_cols <- names(train_data)[sapply(train_data, is.factor)]
for (col in cat_cols) {
test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
}
# Baseline AdaBoost
ab_baseline <- boosting(
y ~ .,
data = train_data,
mfinal = 50,
control = rpart.control(maxdepth = 3)
)
pred_ab <- predict.boosting(ab_baseline, newdata = test_data)
if (is.null(dimnames(pred_ab$prob))) colnames(pred_ab$prob) <- levels(train_data$y)
pred_probs <- pred_ab$prob[, "yes"]
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
cm <- confusionMatrix(factor(pred_class, levels = c("no", "yes")),
factor(test_data$y, levels = c("no", "yes")))
results_list[[i]] <- list(
experiment = "baseline",
pr_auc = pr$auc.integral,
precision = cm$byClass["Precision"],
recall = cm$byClass["Recall"],
f1 = cm$byClass["F1"]
)
# AdaBoost with Weights
case_weights <- ifelse(train_data$y == "yes", class_weights["yes"], class_weights["no"])
ab_weighted <- boosting(
y ~ .,
data = train_data,
mfinal = 50,
control = rpart.control(maxdepth = 3),
weights = case_weights,
boos = TRUE,
coeflearn = "Zhu"
)
pred_ab <- predict.boosting(ab_weighted, newdata = test_data)
if (is.null(dimnames(pred_ab$prob))) colnames(pred_ab$prob) <- levels(train_data$y)
pred_probs <- pred_ab$prob[, "yes"]
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
cm <- confusionMatrix(factor(pred_class, levels = c("no", "yes")),
factor(test_data$y, levels = c("no", "yes")))
results_list[[n_runs + i]] <- list(
experiment = "weighted",
pr_auc = pr$auc.integral,
precision = cm$byClass["Precision"],
recall = cm$byClass["Recall"],
f1 = cm$byClass["F1"]
)
}
# Combine and summarize results
results_df <- bind_rows(lapply(results_list, as.data.frame))
summary_ab <- results_df %>%
group_by(experiment) %>%
summarize(
mean_pr_auc = mean(pr_auc),
sd_pr_auc = sd(pr_auc),
mean_precision = mean(precision),
sd_precision = sd(precision),
mean_recall = mean(recall),
sd_recall = sd(recall),
mean_f1 = mean(f1),
sd_f1 = sd(f1)
)
summary_ab
## # A tibble: 2 × 9
## experiment mean_pr_auc sd_pr_auc mean_precision sd_precision mean_recall
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 baseline 0.428 0.00373 0.903 0.000961 0.983
## 2 weighted 0.429 0.00547 0.905 0.000749 0.982
## # ℹ 3 more variables: sd_recall <dbl>, mean_f1 <dbl>, sd_f1 <dbl>
Here, the results are much closer, and we see a slight increase in PR AUC when applying weights. This makes sense given how AdaBoost operates, focusing on mislabeled items after each iteration. This already handles unbalanced data so the weighting is generally superfluous.
# Will a greater number of trees increase PR AUC?
tree_counts <- c(100, 200)
results_list <- list()
for (nt in tree_counts) {
for (i in 1:n_runs) {
# Ensure factor levels match
cat_cols <- names(train_data)[sapply(train_data, is.factor)]
for (col in cat_cols) {
test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
}
# Train AdaBoost
ab_model <- boosting(
y ~ .,
data = train_data,
mfinal = nt,
control = rpart.control(maxdepth = 3),
boos = TRUE,
coeflearn = "Zhu"
)
pred_ab <- predict.boosting(ab_model, newdata = test_data)
# Ensure probability matrix has column names
if (is.null(dimnames(pred_ab$prob))) {
colnames(pred_ab$prob) <- levels(train_data$y)
}
pred_probs <- pred_ab$prob[, "yes"]
# PR-AUC
fg <- pred_probs[test_data$y == "yes"]
bg <- pred_probs[test_data$y == "no"]
pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
# Threshold 0.5 for confusion matrix
pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
cm <- confusionMatrix(factor(pred_class, levels = c("no","yes")),
factor(test_data$y, levels = c("no","yes")))
# Save
results_list[[paste0("ntree_", nt, "_run_", i)]] <- list(
mfinal = nt,
pr_auc = pr$auc.integral,
precision = cm$byClass["Precision"],
recall = cm$byClass["Recall"],
f1 = cm$byClass["F1"]
)
}
}
# Combine into a single dataframe
results_df <- bind_rows(lapply(results_list, as.data.frame))
# Results
summary_ab_ntrees <- results_df %>%
group_by(mfinal) %>%
summarize(
mean_pr_auc = mean(pr_auc),
sd_pr_auc = sd(pr_auc),
mean_precision = mean(precision),
sd_precision = sd(precision),
mean_recall = mean(recall),
sd_recall = sd(recall),
mean_f1 = mean(f1),
sd_f1 = sd(f1)
)
summary_ab_ntrees
## # A tibble: 2 × 9
## mfinal mean_pr_auc sd_pr_auc mean_precision sd_precision mean_recall sd_recall
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 100 0.433 0.00563 0.905 0.00168 0.983 0.00209
## 2 200 0.441 0.00480 0.905 0.00119 0.983 0.00145
## # ℹ 2 more variables: mean_f1 <dbl>, sd_f1 <dbl>
Experimenting with the number of trees showed a small increase in PR-AUC, indicating marginally improved probability ranking, while other metrics remained stable.