622project2.knit

Assignment 2

Daniel DeBonis

Background

A Portuguese bank conducted a marketing campaign (phone calls) to predict if a client will subscribe to a term deposit The records of their efforts are available in the form of a dataset. The objective here is to apply machine learning techniques to analyze the dataset and figure out most effective tactics that will help the bank in next campaign to persuade more customers to subscribe to the bank’s term deposit.

Importing & Preprocessing Data from Part 1

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

data <- read.csv('https://raw.githubusercontent.com/ddebonis47/classwork/refs/heads/main/bank-full.csv', sep = ';', stringsAsFactors = TRUE)

# all copied and pasted from previous assignment
median_prev <- median(data$previous, na.rm = TRUE)
data$previous <- ifelse(data$previous == 275, median_prev, data$previous)
data$pdays <- as.numeric(data$pdays)
data$pdays[is.na(data$pdays)] <- -1
data <- data |>
   mutate(poutcome=na_if(poutcome, "unknown"))
data <- data |>
   mutate(job=na_if(job, "unknown"))
data <- data |>
   mutate(education=na_if(education, "unknown"))
data <- data |>
   mutate(contact=na_if(contact, "unknown"))

### changed coding of date based on feedback
data <- data |>
  mutate(
    month_factor = factor(month, levels = tolower(month.abb), ordered = TRUE),
    
    # Assign quarters based on month
    quarter = case_when(
      month_factor %in% c("jan","feb","mar") ~ "Q1",
      month_factor %in% c("apr","may","jun") ~ "Q2",
      month_factor %in% c("jul","aug","sep") ~ "Q3",
      month_factor %in% c("oct","nov","dec") ~ "Q4"
    ),
    
    # Assign seasons
    season = case_when(
      month_factor %in% c("dec","jan","feb") ~ "Winter",
      month_factor %in% c("mar","apr","may") ~ "Spring",
      month_factor %in% c("jun","jul","aug") ~ "Summer",
      month_factor %in% c("sep","oct","nov") ~ "Fall"
    ),
    
    # Bucket day into early/mid/late month
    day_group = case_when(
      day <= 10 ~ "Early",
      day <= 20 ~ "Mid",
      TRUE ~ "Late"
    )
  )
### ensuring removal of duration
data <- data |>
  select(-duration)

data$total_contacts <- data$campaign + data$previous
data$contact_ratio <- data$campaign / pmax(data$previous, 1)
data$contacts_per_campaign <- with(data,
                                   ifelse((campaign + previous) > 0,
                                          campaign / (campaign + previous),
                                          NA))
data$contacts_per_campaign[is.na(data$contacts_per_campaign)] <- 0

# Columns where unknown or NA should be a single level
cat_vars <- c("poutcome","job","education","contact","quarter","season","day_group")

for(col in cat_vars){
  data[[col]] <- as.character(data[[col]])        
  data[[col]][is.na(data[[col]])] <- "unknown"   # replace NAs
  data[[col]][data[[col]]=="unknown"] <- "unknown" # replace any lingering "unknown"
  data[[col]] <- factor(data[[col]])             # convert back to factor
}

Additional Prepartions

library(rpart)      
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(PRROC)

## Loading required package: rlang

## 
## Attaching package: 'rlang'

## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice

set.seed(24601)

# Train / Test Split
n_runs <- 30
results_list <- vector("list", n_runs)

# Aligning Factors
#for(col in cat_vars){
 # test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
#}

Decision Trees

Experiment 1 - Tree Depth

# Does the maximum depth of trees impact PR-AUC, Precision, Recall, or F1 score?
# Function to train and evaluate DT
run_dt_experiment <- function(train_data, test_data, maxdepth) {
  dt_model <- rpart(y ~ ., data = train_data, method = "class",
                    control = rpart.control(maxdepth = maxdepth, cp = 0.01))
  
  pred_probs <- predict(dt_model, test_data, type = "prob")[, "yes"]
  
  # PR-AUC
  fg <- pred_probs[test_data$y == "yes"]
  bg <- pred_probs[test_data$y == "no"]
  pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  
  # Precision, Recall, F1 at 0.5 threshold
  pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
  conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
                                 factor(test_data$y, levels=c("no","yes")))
  
  list(pr_auc = pr$auc.integral,
       precision = conf_matrix$byClass["Precision"],
       recall = conf_matrix$byClass["Recall"],
       f1 = conf_matrix$byClass["F1"])
}

# Generate all 30 splits
set.seed(123)
n_runs <- 30
splits_list <- vector("list", n_runs)

for (i in 1:n_runs) {
  train_index <- sample(seq_len(nrow(data)), size = 0.7 * nrow(data))
  splits_list[[i]] <- list(
    train = data[train_index, ],
    test  = data[-train_index, ]
  )
}

# Run experiments for multiple depths on the same splits
depths <- c(5, 10)  
results_all <- list()

for (d in depths) {
  results_all[[paste0("depth_", d)]] <- lapply(splits_list, function(split) {
    run_dt_experiment(split$train, split$test, maxdepth = d)
  })
}

summarize_results <- function(results_list) {
  df <- do.call(rbind, lapply(results_list, as.data.frame))
  data.frame(
    metric = colnames(df),
    mean = apply(df, 2, mean),
    sd = apply(df, 2, sd)
  )
}

summary_depth5 <- summarize_results(results_all[["depth_5"]])
summary_depth10 <- summarize_results(results_all[["depth_10"]])

summary_depth5

##              metric      mean          sd
## pr_auc       pr_auc 0.2505872 0.008995448
## precision precision 0.9011401 0.002211405
## recall       recall 0.9866902 0.000803682
## f1               f1 0.9419749 0.001142619

summary_depth10

##              metric      mean          sd
## pr_auc       pr_auc 0.2505872 0.008995448
## precision precision 0.9011401 0.002211405
## recall       recall 0.9866902 0.000803682
## f1               f1 0.9419749 0.001142619

The results were identical! It appears that the trees are not going deep enough for this distinction to make any difference.

Experiment 2 - Class Weights

# Since we have so few 'yes' values, how does applying class weights to increase the presence of 'yes' values affect PR-AUC, precision, recall, and F1 score? 

run_dt_weighted <- function(train_data, test_data, maxdepth = 5) {
  
  # Compute class weights
  class_counts <- table(train_data$y)
  weights <- ifelse(train_data$y == "yes",
                    as.numeric(class_counts["no"] / class_counts["yes"]),
                    1)
  
  # Train the model
  dt_model <- rpart(y ~ ., data = train_data, method = "class",
                    weights = weights,
                    control = rpart.control(maxdepth = maxdepth, cp = 0.01))
  
  # Predict probabilities
  pred_probs <- predict(dt_model, test_data, type = "prob")[, "yes"]
  
  # PR-AUC
  fg <- pred_probs[test_data$y == "yes"]
  bg <- pred_probs[test_data$y == "no"]
  pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  
  # Precision, Recall, F1 at threshold 0.5
  pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
  conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
                                 factor(test_data$y, levels=c("no","yes")))
  
  # Return metrics
  list(
    pr_auc = pr$auc.integral,
    precision = conf_matrix$byClass["Precision"],
    recall = conf_matrix$byClass["Recall"],
    f1 = conf_matrix$byClass["F1"]
  )
}

# Run 30 experiments
results_weighted <- lapply(splits_list, function(split) {
  run_dt_weighted(split$train, split$test, maxdepth = 5)
})

# Results
metrics_df <- do.call(rbind, lapply(results_weighted, as.data.frame))
summary_stats <- data.frame(
  metric = colnames(metrics_df),
  mean = apply(metrics_df, 2, mean),
  sd = apply(metrics_df, 2, sd)
)

summary_stats

##              metric      mean          sd
## pr_auc       pr_auc 0.3500296 0.012530847
## precision precision 0.9292679 0.002607405
## recall       recall 0.8646072 0.026364502
## f1               f1 0.8955405 0.013369177

PR-AUC increases as does precision to a lesser extent. Recall and F1 decrease. This is likely due to a decrease in overfitting that the weighting caused. This second model is more generalizable because the predictions are less influenced by the fact that the majority of them are ‘no.’ This is why we see an improvement in overall performance with the increase in PR-AUC, while also seeing a notable drop in recall.

Random Forest

Experiment 1 - Number of Trees

# Does increasing the number of trees improve predictive performance?
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

set.seed(24601)

# Function to run one Random Forest experiment with arbitrary ntree
run_rf_ntree <- function(train_data, test_data, ntree = 500) {
  
  rf_model <- randomForest(
    y ~ ., data = train_data, ntree = ntree)
  
  
  # Predict probabilities
  pred_probs <- predict(rf_model, test_data, type = "prob")[, "yes"]
  
  # PR-AUC
  fg <- pred_probs[test_data$y == "yes"]
  bg <- pred_probs[test_data$y == "no"]
  pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  
  # Precision, Recall, F1 at 0.5 threshold
  pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
  conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
                                 factor(test_data$y, levels=c("no","yes")))
  
  list(
    ntree = ntree,
    pr_auc = pr$auc.integral,
    precision = conf_matrix$byClass["Precision"],
    recall = conf_matrix$byClass["Recall"],
    f1 = conf_matrix$byClass["F1"]
  )
}

# Run across 30 splits
ntree_values <- c(100, 500)
results_rf_ntree <- list()

for (nt in ntree_values) {
  results_rf_ntree[[paste0("ntree_", nt)]] <- lapply(splits_list, function(split) {
    run_rf_ntree(split$train, split$test, ntree = nt)
  })
}
# Results
summary_rf_ntree <- lapply(results_rf_ntree, function(res_list) {
  metrics_df <- do.call(rbind, lapply(res_list, as.data.frame))
  data.frame(
    metric = colnames(metrics_df),
    mean = apply(metrics_df, 2, mean),
    sd = apply(metrics_df, 2, sd)
  )
})

summary_rf_ntree

## $ntree_100
##              metric        mean          sd
## ntree         ntree 100.0000000 0.000000000
## pr_auc       pr_auc   0.4344185 0.011471173
## precision precision   0.9076175 0.002754735
## recall       recall   0.9789186 0.001700386
## f1               f1   0.9419169 0.001382787
## 
## $ntree_500
##              metric        mean          sd
## ntree         ntree 500.0000000 0.000000000
## pr_auc       pr_auc   0.4397201 0.010625560
## precision precision   0.9079014 0.002625369
## recall       recall   0.9792888 0.001678334
## f1               f1   0.9422415 0.001323236

Overall, the differences shown between these results are very slight. The PR-AUC increases by about .006. That being said, the differences in computation time did not seem notable either. At 30 iterations, both took considerable amounts of time.

Experiment 2 - Class Weights

# Just as before, can we improve accuracy of our model by applying class weights?
# Function to run one Random Forest experiment
run_rf_weighted <- function(train_data, test_data, ntree = 100) {
  
  # Compute class weights
  class_counts <- table(train_data$y)
  classwt <- as.numeric(class_counts["no"] / class_counts)
  names(classwt) <- names(class_counts)
  
  
  # Train Random Forest
  rf_model <- randomForest(y ~ ., data = train_data, ntree = ntree,
                           classwt = classwt)
  
  # Predict probabilities
  pred_probs <- predict(rf_model, test_data, type = "prob")[, "yes"]
  
  # PR-AUC
  fg <- pred_probs[test_data$y == "yes"]
  bg <- pred_probs[test_data$y == "no"]
  pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  
  # Precision, Recall, F1 at 0.5
  pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
  conf_matrix <- confusionMatrix(factor(pred_class, levels=c("no","yes")),
                                 factor(test_data$y, levels=c("no","yes")))
  
  list(
    pr_auc = pr$auc.integral,
    precision = conf_matrix$byClass["Precision"],
    recall = conf_matrix$byClass["Recall"],
    f1 = conf_matrix$byClass["F1"]
  )
}

# Run across the 30 splits
results_rf <- lapply(splits_list, function(split) {
  run_rf_weighted(split$train, split$test, ntree = 100)
})

# Results
metrics_rf <- do.call(rbind, lapply(results_rf, as.data.frame))
summary_rf <- data.frame(
  metric = colnames(metrics_rf),
  mean = apply(metrics_rf, 2, mean),
  sd = apply(metrics_rf, 2, sd)
)
summary_rf

##              metric      mean          sd
## pr_auc       pr_auc 0.3362144 0.010147283
## precision precision 0.9117388 0.002408161
## recall       recall 0.9450237 0.003839002
## f1               f1 0.9280761 0.001871955

The most notable difference here is the drop in PR-AUC, which is particularly surprising because the values for precision and recall are fairly similar across the two models, with recall taking a tumble. Possibly the weights did not improve the discrimination of the model because the bootstrapping used in random forest models handles these imbalances well on its own.

AdaBoost

Experiment 1 - Class Weights

# Class weights helped a lot in decision trees, but actually hurt model performance using random forests. How does weighting impact the precision and recall when using AdaBoost?
library(adabag)

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loading required package: doParallel

## Loading required package: iterators

## Loading required package: parallel

set.seed(24601)
train_index <- createDataPartition(data$y, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data  <- data[-train_index, ]

# Compute class weights (used for weighted AdaBoost)
class_counts <- table(train_data$y)
class_weights <- c(
  "no" = 1,
  "yes" = as.numeric(class_counts["no"] / class_counts["yes"])
)
n_runs <- 30
results_list <- vector("list", n_runs * 2)  # twice as many for baseline + weighted

for (i in 1:n_runs) {
  
  # Ensure factor levels match
  cat_cols <- names(train_data)[sapply(train_data, is.factor)]
  for (col in cat_cols) {
    test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
  }
  # Baseline AdaBoost
  ab_baseline <- boosting(
    y ~ ., 
    data = train_data,        
    mfinal = 50,
    
    control = rpart.control(maxdepth = 3)
  )
  
  pred_ab <- predict.boosting(ab_baseline, newdata = test_data)
  if (is.null(dimnames(pred_ab$prob))) colnames(pred_ab$prob) <- levels(train_data$y)
  pred_probs <- pred_ab$prob[, "yes"]
  
  fg <- pred_probs[test_data$y == "yes"]
  bg <- pred_probs[test_data$y == "no"]
  pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  
  pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
  cm <- confusionMatrix(factor(pred_class, levels = c("no", "yes")),
                        factor(test_data$y, levels = c("no", "yes")))
  
  results_list[[i]] <- list(
    experiment = "baseline",
    pr_auc = pr$auc.integral,
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    f1 = cm$byClass["F1"]
  )
  # AdaBoost with Weights
  case_weights <- ifelse(train_data$y == "yes", class_weights["yes"], class_weights["no"])
  
  ab_weighted <- boosting(
    y ~ .,
    data = train_data,
    mfinal = 50,
    control = rpart.control(maxdepth = 3),
    weights = case_weights,
    boos = TRUE,
    coeflearn = "Zhu"
  )
  
  pred_ab <- predict.boosting(ab_weighted, newdata = test_data)
  if (is.null(dimnames(pred_ab$prob))) colnames(pred_ab$prob) <- levels(train_data$y)
  pred_probs <- pred_ab$prob[, "yes"]
  
  fg <- pred_probs[test_data$y == "yes"]
  bg <- pred_probs[test_data$y == "no"]
  pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
  
  pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
  cm <- confusionMatrix(factor(pred_class, levels = c("no", "yes")),
                        factor(test_data$y, levels = c("no", "yes")))
  
  results_list[[n_runs + i]] <- list(
    experiment = "weighted",
    pr_auc = pr$auc.integral,
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    f1 = cm$byClass["F1"]
  )
  
}

# Combine and summarize results
results_df <- bind_rows(lapply(results_list, as.data.frame))

summary_ab <- results_df %>%
  group_by(experiment) %>%
  summarize(
    mean_pr_auc = mean(pr_auc),
    sd_pr_auc   = sd(pr_auc),
    mean_precision = mean(precision),
    sd_precision   = sd(precision),
    mean_recall = mean(recall),
    sd_recall   = sd(recall),
    mean_f1    = mean(f1),
    sd_f1      = sd(f1)
  )

summary_ab

## # A tibble: 2 × 9
##   experiment mean_pr_auc sd_pr_auc mean_precision sd_precision mean_recall
##   <chr>            <dbl>     <dbl>          <dbl>        <dbl>       <dbl>
## 1 baseline         0.428   0.00373          0.903     0.000961       0.983
## 2 weighted         0.429   0.00547          0.905     0.000749       0.982
## # ℹ 3 more variables: sd_recall <dbl>, mean_f1 <dbl>, sd_f1 <dbl>

Here, the results are much closer, and we see a slight increase in PR AUC when applying weights. This makes sense given how AdaBoost operates, focusing on mislabeled items after each iteration. This already handles unbalanced data so the weighting is generally superfluous.

Experiment 2 - Number of Trees

# Will a greater number of trees increase PR AUC?
tree_counts <- c(100, 200)

results_list <- list()
for (nt in tree_counts) {
  for (i in 1:n_runs) {
    
    # Ensure factor levels match
    cat_cols <- names(train_data)[sapply(train_data, is.factor)]
    for (col in cat_cols) {
      test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
    }
    
    # Train AdaBoost
    ab_model <- boosting(
      y ~ .,
      data = train_data,
      mfinal = nt,
      control = rpart.control(maxdepth = 3),  
      boos = TRUE,
      coeflearn = "Zhu"               
    )
    
    pred_ab <- predict.boosting(ab_model, newdata = test_data)
    
    # Ensure probability matrix has column names
    if (is.null(dimnames(pred_ab$prob))) {
      colnames(pred_ab$prob) <- levels(train_data$y)
    }
    
    pred_probs <- pred_ab$prob[, "yes"]
    
    # PR-AUC
    fg <- pred_probs[test_data$y == "yes"]
    bg <- pred_probs[test_data$y == "no"]
    pr <- pr.curve(scores.class0 = fg, scores.class1 = bg, curve = FALSE)
    
    # Threshold 0.5 for confusion matrix
    pred_class <- ifelse(pred_probs > 0.5, "yes", "no")
    cm <- confusionMatrix(factor(pred_class, levels = c("no","yes")),
                          factor(test_data$y, levels = c("no","yes")))
    
    # Save 
    results_list[[paste0("ntree_", nt, "_run_", i)]] <- list(
      mfinal = nt,
      pr_auc = pr$auc.integral,
      precision = cm$byClass["Precision"],
      recall = cm$byClass["Recall"],
      f1 = cm$byClass["F1"]
    )
    
  }
}

# Combine into a single dataframe
results_df <- bind_rows(lapply(results_list, as.data.frame))

# Results
summary_ab_ntrees <- results_df %>%
  group_by(mfinal) %>%
  summarize(
    mean_pr_auc = mean(pr_auc),
    sd_pr_auc   = sd(pr_auc),
    mean_precision = mean(precision),
    sd_precision   = sd(precision),
    mean_recall = mean(recall),
    sd_recall   = sd(recall),
    mean_f1    = mean(f1),
    sd_f1      = sd(f1)
  )

summary_ab_ntrees

## # A tibble: 2 × 9
##   mfinal mean_pr_auc sd_pr_auc mean_precision sd_precision mean_recall sd_recall
##    <dbl>       <dbl>     <dbl>          <dbl>        <dbl>       <dbl>     <dbl>
## 1    100       0.433   0.00563          0.905      0.00168       0.983   0.00209
## 2    200       0.441   0.00480          0.905      0.00119       0.983   0.00145
## # ℹ 2 more variables: mean_f1 <dbl>, sd_f1 <dbl>

Experimenting with the number of trees showed a small increase in PR-AUC, indicating marginally improved probability ranking, while other metrics remained stable.