Introduction

This RMD supports Assignment 02 for CUNY SPS DATA 622, where we begin to build a model with a dataset from a Portuguese bank’s marketing campaign. The bank used phone calls to predict whether clients would subscribe to a term deposit. The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y) by applying machine learning techniques to analyze the data and uncover the most effective strategies for boosting customer subscriptions in future campaigns. Three algorithms will be used, two times each - Decision Trees, Random Forest, and Adaboost.

Load Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(adabag)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(ada)

Data Prep

# Read csv from github
bank_df <- read.csv("https://raw.githubusercontent.com/evanskaylie/DATA622/refs/heads/main/bank-full.csv", sep = ";")

# Change character cols to factors
character_columns <- sapply(bank_df, is.character)
bank_df[character_columns] <- lapply(bank_df[character_columns], as.factor)

# Check the data
head(bank_df)
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no
summary(bank_df)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome       y        
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 
# Initialize matrix to store metrics from each experiment
results_matrix <- matrix(NA, nrow = 0, ncol = 6) 
colnames(results_matrix) <- c("Model",
                              "Precision",
                              "Accuracy", 
                              "Recall", 
                              "F1", 
                              "AUC")

Experiment 1/6 Decision Trees

Define the objective of the experiment (hypothesis)

  • The objective of this experiment is to train a decision tree on the bank marketing dataset without any pre-processing (such as pruning or resampling) and see how it performs with default settings. We aim to assess how well a decision tree can predict whether a client subscribes to a term deposit based on the given features.

  • A decision tree model can identify meaningful patterns in the data and provide a good baseline for classification, but it might overfit due to the dataset’s complexity and the presence of outliers.

Decide what will change, and what will stay the same

  • Starting, so delta to measure yet. The target variable is “y.”

Select the evaluation metric (what you want to measure)

  • For all 6 experiments, the evaluation metrics are Precision, Accuracy, Recall, F1, and AUC

Perform the experiment

  • See below

Document the experiment so you compare results (track progress)

  • See below
# Set seed for reproducibility
set.seed(64)

# Split the data into training (80%) and testing (20%) sets
exp1_trainIndex <- createDataPartition(bank_df$y, p = 0.8, list = FALSE)
exp1_train <- bank_df[exp1_trainIndex, ]
exp1_test <- bank_df[-exp1_trainIndex, ]

# Build the decision tree model using the training data
exp1_model <- rpart(y ~ ., data = exp1_train, method = "class")

# Check the model's summary
summary(exp1_model)
## Call:
## rpart(formula = y ~ ., data = exp1_train, method = "class")
##   n= 36170 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.03544423      0 1.0000000 1.0000000 0.01444464
## 2 0.02859168      3 0.8936673 0.9005198 0.01379740
## 3 0.01559546      4 0.8650756 0.8731096 0.01361012
## 4 0.01000000      5 0.8494802 0.8565690 0.01349510
## 
## Variable importance
## duration poutcome 
##       63       37 
## 
## Node number 1: 36170 observations,    complexity param=0.03544423
##   predicted class=no   expected loss=0.117003  P(node) =1
##     class counts: 31938  4232
##    probabilities: 0.883 0.117 
##   left son=2 (32119 obs) right son=3 (4051 obs)
##   Primary splits:
##       duration < 521.5   to the left,   improve=941.0716, (0 missing)
##       poutcome splits as  LLRL,         improve=679.7777, (0 missing)
##       month    splits as  LLRLLLLRLLRR, improve=400.0282, (0 missing)
##       pdays    < 8.5     to the left,   improve=210.9907, (0 missing)
##       previous < 0.5     to the left,   improve=208.1616, (0 missing)
## 
## Node number 2: 32119 observations,    complexity param=0.03544423
##   predicted class=no   expected loss=0.07649678  P(node) =0.8880011
##     class counts: 29662  2457
##    probabilities: 0.924 0.076 
##   left son=4 (31069 obs) right son=5 (1050 obs)
##   Primary splits:
##       poutcome splits as  LLRL,         improve=614.6094, (0 missing)
##       month    splits as  LLRLLLLRLLRR, improve=395.9786, (0 missing)
##       pdays    < 16      to the left,   improve=202.3223, (0 missing)
##       previous < 0.5     to the left,   improve=199.7756, (0 missing)
##       duration < 205.5   to the left,   improve=172.3224, (0 missing)
## 
## Node number 3: 4051 observations,    complexity param=0.03544423
##   predicted class=no   expected loss=0.4381634  P(node) =0.1119989
##     class counts:  2276  1775
##    probabilities: 0.562 0.438 
##   left son=6 (2505 obs) right son=7 (1546 obs)
##   Primary splits:
##       duration < 807.5   to the left,   improve=89.29620, (0 missing)
##       contact  splits as  RRL,          improve=49.47882, (0 missing)
##       poutcome splits as  LLRL,         improve=48.34208, (0 missing)
##       month    splits as  LRRLLLLRLLRR, improve=24.81625, (0 missing)
##       marital  splits as  RLR,          improve=21.53750, (0 missing)
##   Surrogate splits:
##       campaign < 23      to the left,  agree=0.619, adj=0.003, (0 split)
##       previous < 14.5    to the left,  agree=0.619, adj=0.002, (0 split)
##       balance  < -2385.5 to the right, agree=0.619, adj=0.001, (0 split)
##       pdays    < 388.5   to the left,  agree=0.619, adj=0.001, (0 split)
## 
## Node number 4: 31069 observations
##   predicted class=no   expected loss=0.05851492  P(node) =0.8589715
##     class counts: 29251  1818
##    probabilities: 0.941 0.059 
## 
## Node number 5: 1050 observations,    complexity param=0.02859168
##   predicted class=yes  expected loss=0.3914286  P(node) =0.02902958
##     class counts:   411   639
##    probabilities: 0.391 0.609 
##   left son=10 (301 obs) right son=11 (749 obs)
##   Primary splits:
##       duration < 162.5   to the left,   improve=80.875390, (0 missing)
##       housing  splits as  RL,           improve=14.295600, (0 missing)
##       month    splits as  RRRRLRRRLLRR, improve=14.211940, (0 missing)
##       pdays    < 197.5   to the right,  improve= 8.737133, (0 missing)
##       job      splits as  LLLLRRRLRLRR, improve= 7.710840, (0 missing)
##   Surrogate splits:
##       contact  splits as  RRL,         agree=0.720, adj=0.023, (0 split)
##       campaign < 6.5     to the right, agree=0.719, adj=0.020, (0 split)
##       pdays    < 2.5     to the left,  agree=0.716, adj=0.010, (0 split)
##       default  splits as  RL,          agree=0.714, adj=0.003, (0 split)
## 
## Node number 6: 2505 observations,    complexity param=0.01559546
##   predicted class=no   expected loss=0.3556886  P(node) =0.06925629
##     class counts:  1614   891
##    probabilities: 0.644 0.356 
##   left son=12 (2397 obs) right son=13 (108 obs)
##   Primary splits:
##       poutcome splits as  LLRL,         improve=45.68373, (0 missing)
##       contact  splits as  RRL,          improve=35.80869, (0 missing)
##       month    splits as  LRRLLLLRLLRR, improve=22.05744, (0 missing)
##       pdays    < 0       to the left,   improve=21.42791, (0 missing)
##       previous < 0.5     to the left,   improve=21.42791, (0 missing)
## 
## Node number 7: 1546 observations
##   predicted class=yes  expected loss=0.4282018  P(node) =0.0427426
##     class counts:   662   884
##    probabilities: 0.428 0.572 
## 
## Node number 10: 301 observations
##   predicted class=no   expected loss=0.2990033  P(node) =0.008321814
##     class counts:   211    90
##    probabilities: 0.701 0.299 
## 
## Node number 11: 749 observations
##   predicted class=yes  expected loss=0.2670227  P(node) =0.02070777
##     class counts:   200   549
##    probabilities: 0.267 0.733 
## 
## Node number 12: 2397 observations
##   predicted class=no   expected loss=0.3354193  P(node) =0.06627039
##     class counts:  1593   804
##    probabilities: 0.665 0.335 
## 
## Node number 13: 108 observations
##   predicted class=yes  expected loss=0.1944444  P(node) =0.0029859
##     class counts:    21    87
##    probabilities: 0.194 0.806
# Plot the decision tree with adjusted palette and text size
rpart.plot(exp1_model,
           main = "Decision Tree for Predicting Subscription",
           extra = 104,
           type = 3,
           fallen.leaves = TRUE,
           box.palette = c("lightgray", "lightblue"),
           cex = 0.8)

# Ensure both the actual and predicted values have the same factor levels
exp1_predictions <- predict(exp1_model, 
                            newdata = exp1_test, 
                            type = "class")

# Make sure exp1_test$Y has the same factor levels as the training data's y variable
exp1_test$y <- factor(exp1_test$y, levels = levels(exp1_train$y))

# Confusion Matrix
exp1_cm <- confusionMatrix(exp1_predictions, 
                            exp1_test$y)

# Probability predictions for AUC calculation
exp1_probabilities <- predict(exp1_model, 
                               newdata = exp1_test, 
                               type = "prob")[, 2]

# AUC calculation
exp1_auc <- auc(roc(exp1_test$y, 
                     exp1_probabilities))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
results_matrix <- rbind(results_matrix, 
                        c("1 Decision Tree",
                          exp1_cm$overall["Accuracy"],
                          exp1_cm$byClass["Precision"],
                          exp1_cm$byClass["Recall"],
                          exp1_cm$byClass["F1"],
                          exp1_auc))

# Show results matrix
results_matrix
##      Model             Precision           Accuracy           
## [1,] "1 Decision Tree" "0.902333812631346" "0.919036940871002"
##      Recall              F1                  AUC                
## [1,] "0.975325651302605" "0.946345020356079" "0.741402625496973"

Experiment 2/6 Decision Trees with Balanced y

Define the objective of the experiment (hypothesis)

  • The objective of this experiment is to assess the impact of balancing the target variable (y) before training a decision tree. Since the dataset may have an imbalance between “yes” and “no” responses, balancing the classes should help the model learn more equally from both outcomes.

  • Balancing the y variable will improve classification performance, particularly in recall and F1-score, as the model will no longer be biased toward the majority class. However, accuracy may decrease if the model starts misclassifying more “no” cases.

Decide what will change, and what will stay the same

  • With experiment 2, there will be the same target variable, features, and lack of pruning. The difference is the target variable (y) will be balanced before training. The dataset will be resampled to have an equal number of “yes” and “no” cases, by undersampling the majority class.

Select the evaluation metric (what you want to measure)

  • For all 6 experiments, the evaluation metrics are Precision, Accuracy, Recall, F1, and AUC

Perform the experiment

  • See below

Document the experiment so you compare results (track progress)

  • See below
# Set seed for reproducibility
set.seed(64)

# Balance the dataset by undersampling the majority class
exp2_minority_class <- bank_df[bank_df$y == "yes", ]
exp2_majority_class <- bank_df[bank_df$y == "no", ]

# Randomly sample from the majority class to match the minority class size
exp2_majority_class_sampled <- exp2_majority_class[sample(nrow(exp2_majority_class), nrow(exp2_minority_class)), ]

# Combine the balanced dataset
exp2_balanced_df <- rbind(exp2_minority_class, exp2_majority_class_sampled)

# Split the balanced data into training (80%) and testing (20%) sets
exp2_trainIndex <- createDataPartition(exp2_balanced_df$y, p = 0.8, list = FALSE)
exp2_train <- exp2_balanced_df[exp2_trainIndex, ]
exp2_test <- exp2_balanced_df[-exp2_trainIndex, ]

# Build the decision tree model using the balanced training data
exp2_model <- rpart(y ~ ., data = exp2_train, method = "class")

# Check the model's summary
summary(exp2_model)
## Call:
## rpart(formula = y ~ ., data = exp2_train, method = "class")
##   n= 8464 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.44258034      0 1.0000000 1.0222117 0.010866884
## 2 0.04383270      1 0.5574197 0.5595463 0.009758428
## 3 0.04040643      3 0.4697543 0.4704631 0.009220510
## 4 0.01591052      4 0.4293478 0.4470699 0.009056823
## 5 0.01000000      7 0.3816163 0.3799622 0.008527956
## 
## Variable importance
## duration poutcome  contact    month  housing campaign      job 
##       58       16       13        7        3        1        1 
## 
## Node number 1: 8464 observations,    complexity param=0.4425803
##   predicted class=no   expected loss=0.5  P(node) =1
##     class counts:  4232  4232
##    probabilities: 0.500 0.500 
##   left son=2 (3411 obs) right son=3 (5053 obs)
##   Primary splits:
##       duration < 210.5   to the left,   improve=861.3709, (0 missing)
##       poutcome splits as  LLRL,         improve=353.2212, (0 missing)
##       contact  splits as  RRL,          improve=300.7854, (0 missing)
##       month    splits as  RLRLLLLRLLRR, improve=264.6980, (0 missing)
##       pdays    < 8.5     to the left,   improve=233.1856, (0 missing)
##   Surrogate splits:
##       campaign < 5.5     to the right,  agree=0.604, adj=0.017, (0 split)
##       balance  < -665.5  to the left,   agree=0.598, adj=0.003, (0 split)
##       job      splits as  RRRLRRRRRRRR, agree=0.598, adj=0.002, (0 split)
##       day      < 30.5    to the right,  agree=0.598, adj=0.002, (0 split)
##       age      < 18.5    to the left,   agree=0.597, adj=0.001, (0 split)
## 
## Node number 2: 3411 observations,    complexity param=0.04040643
##   predicted class=no   expected loss=0.2254471  P(node) =0.4030009
##     class counts:  2642   769
##    probabilities: 0.775 0.225 
##   left son=4 (3178 obs) right son=5 (233 obs)
##   Primary splits:
##       poutcome splits as  LLRL,         improve=205.8329, (0 missing)
##       month    splits as  RLRRLLLRLLRR, improve=203.5423, (0 missing)
##       pdays    < 9.5     to the left,   improve=111.1107, (0 missing)
##       previous < 0.5     to the left,   improve=107.8845, (0 missing)
##       duration < 129.5   to the left,   improve=102.0471, (0 missing)
## 
## Node number 3: 5053 observations,    complexity param=0.0438327
##   predicted class=yes  expected loss=0.3146646  P(node) =0.5969991
##     class counts:  1590  3463
##    probabilities: 0.315 0.685 
##   left son=6 (2855 obs) right son=7 (2198 obs)
##   Primary splits:
##       duration < 493.5   to the left,   improve=194.61970, (0 missing)
##       contact  splits as  RRL,          improve=160.92140, (0 missing)
##       poutcome splits as  LLRL,         improve=106.25500, (0 missing)
##       housing  splits as  RL,           improve= 85.76641, (0 missing)
##       month    splits as  LLRLLLLRLLRR, improve= 81.93030, (0 missing)
##   Surrogate splits:
##       month    splits as  LLLLLRLLRLLL, agree=0.576, adj=0.026, (0 split)
##       balance  < -6      to the right,  agree=0.573, adj=0.019, (0 split)
##       campaign < 5.5     to the left,   agree=0.572, adj=0.016, (0 split)
##       job      splits as  LLRLLLLLLLLL, agree=0.567, adj=0.005, (0 split)
##       default  splits as  LR,           agree=0.567, adj=0.004, (0 split)
## 
## Node number 4: 3178 observations
##   predicted class=no   expected loss=0.1784141  P(node) =0.3754726
##     class counts:  2611   567
##    probabilities: 0.822 0.178 
## 
## Node number 5: 233 observations
##   predicted class=yes  expected loss=0.1330472  P(node) =0.02752836
##     class counts:    31   202
##    probabilities: 0.133 0.867 
## 
## Node number 6: 2855 observations,    complexity param=0.0438327
##   predicted class=yes  expected loss=0.4364273  P(node) =0.337311
##     class counts:  1246  1609
##    probabilities: 0.436 0.564 
##   left son=12 (451 obs) right son=13 (2404 obs)
##   Primary splits:
##       contact  splits as  RRL,         improve=241.5725, (0 missing)
##       housing  splits as  RL,          improve=168.1001, (0 missing)
##       poutcome splits as  LRRL,        improve=162.6759, (0 missing)
##       pdays    < 9       to the left,  improve=158.2667, (0 missing)
##       previous < 0.5     to the left,  improve=157.5695, (0 missing)
##   Surrogate splits:
##       balance  < -1381   to the left,  agree=0.843, adj=0.004, (0 split)
##       campaign < 17.5    to the right, agree=0.842, adj=0.002, (0 split)
## 
## Node number 7: 2198 observations
##   predicted class=yes  expected loss=0.1565059  P(node) =0.2596881
##     class counts:   344  1854
##    probabilities: 0.157 0.843 
## 
## Node number 12: 451 observations
##   predicted class=no   expected loss=0.0886918  P(node) =0.0532845
##     class counts:   411    40
##    probabilities: 0.911 0.089 
## 
## Node number 13: 2404 observations,    complexity param=0.01591052
##   predicted class=yes  expected loss=0.3473378  P(node) =0.2840265
##     class counts:   835  1569
##    probabilities: 0.347 0.653 
##   left son=26 (1833 obs) right son=27 (571 obs)
##   Primary splits:
##       month    splits as  LLRLLLRRLLRR, improve=113.70730, (0 missing)
##       poutcome splits as  LLRL,         improve= 99.09282, (0 missing)
##       housing  splits as  RL,           improve= 86.68149, (0 missing)
##       pdays    < 9       to the left,   improve= 67.32712, (0 missing)
##       previous < 0.5     to the left,   improve= 66.63591, (0 missing)
##   Surrogate splits:
##       day   < 1.5     to the right, agree=0.772, adj=0.040, (0 split)
##       pdays < 570     to the left,  agree=0.764, adj=0.007, (0 split)
##       age   < 86.5    to the left,  agree=0.763, adj=0.004, (0 split)
## 
## Node number 26: 1833 observations,    complexity param=0.01591052
##   predicted class=yes  expected loss=0.4331697  P(node) =0.2165643
##     class counts:   794  1039
##    probabilities: 0.433 0.567 
##   left son=52 (1569 obs) right son=53 (264 obs)
##   Primary splits:
##       poutcome splits as  LLRL,         improve=85.61982, (0 missing)
##       housing  splits as  RL,           improve=70.11674, (0 missing)
##       pdays    < 9       to the left,   improve=49.97514, (0 missing)
##       previous < 0.5     to the left,   improve=49.57244, (0 missing)
##       job      splits as  RLLLRRLLRRRR, improve=38.62509, (0 missing)
##   Surrogate splits:
##       balance < 25986.5 to the left,  agree=0.857, adj=0.004, (0 split)
## 
## Node number 27: 571 observations
##   predicted class=yes  expected loss=0.07180385  P(node) =0.06746219
##     class counts:    41   530
##    probabilities: 0.072 0.928 
## 
## Node number 52: 1569 observations,    complexity param=0.01591052
##   predicted class=yes  expected loss=0.4958572  P(node) =0.1853733
##     class counts:   778   791
##    probabilities: 0.496 0.504 
##   left son=104 (654 obs) right son=105 (915 obs)
##   Primary splits:
##       housing  splits as  RL,           improve=56.40143, (0 missing)
##       age      < 60.5    to the left,   improve=33.74642, (0 missing)
##       job      splits as  RLLRRRLLRRRR, improve=33.34551, (0 missing)
##       duration < 346.5   to the left,   improve=31.11770, (0 missing)
##       pdays    < 372     to the left,   improve=20.78586, (0 missing)
##   Surrogate splits:
##       month    splits as  RR-RRR--LR--, agree=0.656, adj=0.174, (0 split)
##       job      splits as  RLRRRRRLRRRR, agree=0.649, adj=0.157, (0 split)
##       pdays    < 240.5   to the right,  agree=0.628, adj=0.109, (0 split)
##       poutcome splits as  LR-R,         agree=0.604, adj=0.050, (0 split)
##       previous < 0.5     to the right,  agree=0.602, adj=0.046, (0 split)
## 
## Node number 53: 264 observations
##   predicted class=yes  expected loss=0.06060606  P(node) =0.03119093
##     class counts:    16   248
##    probabilities: 0.061 0.939 
## 
## Node number 104: 654 observations
##   predicted class=no   expected loss=0.3455657  P(node) =0.07726843
##     class counts:   428   226
##    probabilities: 0.654 0.346 
## 
## Node number 105: 915 observations
##   predicted class=yes  expected loss=0.3825137  P(node) =0.1081049
##     class counts:   350   565
##    probabilities: 0.383 0.617
# Plot the decision tree with adjusted palette and text size
rpart.plot(exp2_model,
           main = "Decision Tree with Balanced y",
           extra = 104,
           type = 3,
           fallen.leaves = TRUE,
           box.palette = c("lightgray", "lightblue"),
           cex = 0.8)

# Make predictions
exp2_predictions <- predict(exp2_model, 
                            newdata = exp2_test, 
                            type = "class")

# Ensure factor levels are consistent
exp2_test$y <- factor(exp2_test$y, levels = levels(exp2_train$y))

# Confusion Matrix
exp2_cm <- confusionMatrix(exp2_predictions, 
                            exp2_test$y)

# Probability predictions for AUC calculation
exp2_probabilities <- predict(exp2_model, 
                               newdata = exp2_test, 
                               type = "prob")[, 2]

# AUC calculation
exp2_auc <- auc(roc(exp2_test$y, 
                     exp2_probabilities))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
results_matrix <- rbind(results_matrix, 
                        c("2 Decision Tree Balanced",
                          exp2_cm$overall["Accuracy"],
                          exp2_cm$byClass["Precision"],
                          exp2_cm$byClass["Recall"],
                          exp2_cm$byClass["F1"],
                          exp2_auc))

# Show results matrix
results_matrix
##      Model                      Precision           Accuracy           
## [1,] "1 Decision Tree"          "0.902333812631346" "0.919036940871002"
## [2,] "2 Decision Tree Balanced" "0.809366130558184" "0.798357664233577"
##      Recall              F1                  AUC                
## [1,] "0.975325651302605" "0.946345020356079" "0.741402625496973"
## [2,] "0.827814569536424" "0.812819321876451" "0.843501761916994"

Experiment 3/6 Random Forest

Define the objective of the experiment (hypothesis)

  • The random forest is expected to improve generalization and reduce overfitting while increasing accuracy and F1-score.

Decide what will change, and what will stay the same

  • With experiment 2, there will be the same target variable and features. The dataset will remain unbalanced to isolate the effect of bagging alone. Bagging (bootstrap sampling) will be applied to train multiple trees.

Select the evaluation metric (what you want to measure)

  • For all 6 experiments, the evaluation metrics are Precision, Accuracy, Recall, F1, and AUC

Perform the experiment

  • See below

Document the experiment so you compare results (track progress)

  • See below
# Set seed for reproducibility
set.seed(64)

# Split the original (unbalanced) dataset into training (80%) and testing (20%) sets
exp3_trainIndex <- createDataPartition(bank_df$y, p = 0.8, list = FALSE)
exp3_train <- bank_df[exp3_trainIndex, ]
exp3_test <- bank_df[-exp3_trainIndex, ]

# Train a random forest model with default parameters
exp3_model <- randomForest(y ~ ., data = exp3_train)

# Check model summary
print(exp3_model)
## 
## Call:
##  randomForest(formula = y ~ ., data = exp3_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 9.38%
## Confusion matrix:
##        no  yes class.error
## no  30759 1179  0.03691527
## yes  2213 2019  0.52292060
# Make predictions on the test set
exp3_predictions <- predict(exp3_model, newdata = exp3_test)

# Ensure factor levels are consistent
exp3_test$y <- factor(exp3_test$y, levels = levels(exp3_train$y))

# Confusion Matrix
exp3_cm <- confusionMatrix(exp3_predictions, exp3_test$y)

# Probability predictions for AUC calculation
exp3_probabilities <- predict(exp3_model, newdata = exp3_test, type = "prob")[, 2]

# AUC calculation
exp3_auc <- auc(roc(exp3_test$y, exp3_probabilities))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add results to the matrix
results_matrix <- rbind(results_matrix, 
                        c("3 Random Forest",
                          exp3_cm$overall["Accuracy"],
                          exp3_cm$byClass["Precision"],
                          exp3_cm$byClass["Recall"],
                          exp3_cm$byClass["F1"],
                          exp3_auc))

# Show results matrix
results_matrix
##      Model                      Precision           Accuracy           
## [1,] "1 Decision Tree"          "0.902333812631346" "0.919036940871002"
## [2,] "2 Decision Tree Balanced" "0.809366130558184" "0.798357664233577"
## [3,] "3 Random Forest"          "0.911624820263245" "0.936141799198737"
##      Recall              F1                  AUC                
## [1,] "0.975325651302605" "0.946345020356079" "0.741402625496973"
## [2,] "0.827814569536424" "0.812819321876451" "0.843501761916994"
## [3,] "0.965806613226453" "0.950742864188398" "0.937082004595757"

Experiment 4/6 Random Forest with Hyperparameter Tuning

Define the objective of the experiment (hypothesis)

  • The random forest with hyperparameter tuning is expected to improve generalization and accuracy than in experiment 3.

Decide what will change, and what will stay the same

  • Compared to experiment 3, the dataset and core model are not changing. The mtry (number of features randomly selected at each split) will be optimized using random search. The model will undergo 3-fold cross-validation to assess different parameter settings.

Select the evaluation metric (what you want to measure)

  • For all 6 experiments, the evaluation metrics are Precision, Accuracy, Recall, F1, and AUC

Perform the experiment

  • See below

Document the experiment so you compare results (track progress)

  • See below
# Set seed for reproducibility
set.seed(64)

# Sample 50% of the training data, maintaining class balance
exp4_train_sampled <- exp3_train |>
  group_by(y) |>
  sample_frac(0.5) |>
  ungroup()

# Define hyperparameter search space
exp4_tune_grid <- expand.grid(mtry = sample(3:7, 3))

# Define train control using 3-fold cross-validation
exp4_train_control <- trainControl(method = "cv", number = 3, search = "random")

# Train the random forest model with hyperparameter tuning on the sampled data
exp4_model <- train(y ~ ., 
                    data = exp4_train_sampled, 
                    method = "rf", 
                    trControl = exp4_train_control,
                    tuneGrid = exp4_tune_grid)

# Print best parameters
print(exp4_model$bestTune)
##   mtry
## 3    7
# Plot model performance
plot(exp4_model)

# Make predictions on test data
exp4_predictions <- predict(exp4_model, newdata = exp3_test)

# Ensure factor levels are consistent
exp3_test$y <- factor(exp3_test$y, levels = levels(exp3_train$y))

# Confusion Matrix
exp4_cm <- confusionMatrix(exp4_predictions, exp3_test$y)

# Probability predictions for AUC calculation
exp4_probabilities <- predict(exp4_model, newdata = exp3_test, type = "prob")[, 2]

# AUC calculation
exp4_auc <- auc(roc(exp3_test$y, exp4_probabilities))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
results_matrix <- rbind(results_matrix, 
                        c("4 Random Forest Tuned",
                          exp4_cm$overall["Accuracy"],
                          exp4_cm$byClass["Precision"],
                          exp4_cm$byClass["Recall"],
                          exp4_cm$byClass["F1"],
                          exp4_auc))

# Show results matrix
results_matrix
##      Model                      Precision           Accuracy           
## [1,] "1 Decision Tree"          "0.902333812631346" "0.919036940871002"
## [2,] "2 Decision Tree Balanced" "0.809366130558184" "0.798357664233577"
## [3,] "3 Random Forest"          "0.911624820263245" "0.936141799198737"
## [4,] "4 Random Forest Tuned"    "0.906868709213583" "0.924310836501901"
##      Recall              F1                  AUC                
## [1,] "0.975325651302605" "0.946345020356079" "0.741402625496973"
## [2,] "0.827814569536424" "0.812819321876451" "0.843501761916994"
## [3,] "0.965806613226453" "0.950742864188398" "0.937082004595757"
## [4,] "0.974323647294589" "0.948658536585366" "0.928571961804403"

Experiment 5/6 AdaBoost

Define the objective of the experiment (hypothesis)

  • The hypothesis is that Adaboost will improve the model’s performance compared to a single decision tree by focusing on the mistakes of previous models, leading to better accuracy, precision, and recall, especially in class-imbalanced settings.

Decide what will change, and what will stay the same

  • The decision tree model is replaced with an Adaboost model and use boosting instead of bagging. Hyperparameter tuning will not be applied at this stage to keep things simple and isolate the effect of boosting.

Select the evaluation metric (what you want to measure)

  • For all 6 experiments, the evaluation metrics are Precision, Accuracy, Recall, F1, and AUC

Perform the experiment

  • See below

Document the experiment so you compare results (track progress)

  • See below
# Set seed for reproducibility
set.seed(64)

# Split the data into training and testing sets
exp5_trainIndex <- createDataPartition(bank_df$y, p = 0.8, list = FALSE)
exp5_train <- bank_df[exp5_trainIndex, ]
exp5_test <- bank_df[-exp5_trainIndex, ]

# Train the AdaBoost model using the adabag package
exp5_model_adaboost <- boosting(y ~ ., data = exp5_train, mfinal = 50)

# Print the model summary
# print(exp5_model_adaboost)

# Predict the classes
exp5_predictions <- predict(exp5_model_adaboost, newdata = exp5_test)

# Ensure that 'y' in both testData and predictions has the same factor levels
exp5_predictions$class <- factor(exp5_predictions$class, levels = levels(exp5_train$y))

# Confusion Matrix
exp5_confMat <- confusionMatrix(exp5_predictions$class, exp5_test$y)
# Predict the classes
exp5_predictions_ada <- predict(exp5_model_adaboost, newdata = exp5_test)

# Ensure that 'y' in both testData and predictions has the same factor levels
exp5_predictions_ada$class <- factor(exp5_predictions_ada$class, levels = levels(exp5_train$y))

# Confusion Matrix
exp5_cm_ada <- confusionMatrix(exp5_predictions$class, exp5_test$y)

# Probability predictions for AUC calculation
exp5_probabilities_ada <- predict(exp5_model_adaboost, newdata = exp5_test, type = "prob")$prob[,2]

# AUC calculation
exp5_auc_ada <- auc(roc(exp5_test$y, exp5_probabilities_ada))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Store results in the matrix
results_matrix <- rbind(results_matrix, 
                        c("5 Adaboost (adabag)",
                          exp5_cm_ada$overall["Accuracy"],
                          exp5_cm_ada$byClass["Precision"],
                          exp5_cm_ada$byClass["Recall"],
                          exp5_cm_ada$byClass["F1"],
                          exp5_auc_ada))

# Display results matrix
results_matrix
##      Model                      Precision           Accuracy           
## [1,] "1 Decision Tree"          "0.902333812631346" "0.919036940871002"
## [2,] "2 Decision Tree Balanced" "0.809366130558184" "0.798357664233577"
## [3,] "3 Random Forest"          "0.911624820263245" "0.936141799198737"
## [4,] "4 Random Forest Tuned"    "0.906868709213583" "0.924310836501901"
## [5,] "5 Adaboost (adabag)"      "0.908306603251853" "0.933373712901272"
##      Recall              F1                  AUC                
## [1,] "0.975325651302605" "0.946345020356079" "0.741402625496973"
## [2,] "0.827814569536424" "0.812819321876451" "0.843501761916994"
## [3,] "0.965806613226453" "0.950742864188398" "0.937082004595757"
## [4,] "0.974323647294589" "0.948658536585366" "0.928571961804403"
## [5,] "0.965055110220441" "0.948950058501139" "0.932185563179339"

Experiment 6/6 AdaBoost and Hyperparameter Tuning

Define the objective of the experiment (hypothesis)

  • By adjusting key hyperparameters such as the number of iterations (trees) and the learning rate (eta), the model’s performance will be optimized, leading to better classification accuracy and generalization compared to the default settings.

Decide what will change, and what will stay the same

  • The hyperparameters of the Adaboost model, specifically nIter (number of iterations/trees) and eta (learning rate).

Select the evaluation metric (what you want to measure)

  • For all 6 experiments, the evaluation metrics are Precision, Accuracy, Recall, F1, and AUC

Perform the experiment

  • See below

Document the experiment so you compare results (track progress)

  • See below
# Set seed for reproducibility
set.seed(64)

# Split the data into training and testing sets
exp6_trainIndex <- createDataPartition(bank_df$y, p = 0.8, list = FALSE)
exp6_train <- bank_df[exp5_trainIndex, ]
exp6_test <- bank_df[-exp5_trainIndex, ]

# Define hyperparameter grid for 'mfinal' (number of boosting iterations)
mfinal_values <- c(25, 50, 75, 100)
exp6_results <- data.frame(mfinal = integer(), Accuracy = numeric(), stringsAsFactors = FALSE)

# Loop over mfinal values for manual tuning
for (mfinal in mfinal_values) {
  # Train the AdaBoost model using adabag with the current mfinal
  adaModel <- boosting(y ~ ., data = exp6_train, mfinal = mfinal)
  
  # Predict the classes on the test set
  exp6_predictions <- predict(adaModel, newdata = exp6_test)
  
  # Ensure the predicted classes and the actual classes have the same factor levels
  exp6_predictions$class <- factor(exp6_predictions$class, levels = levels(exp6_train$y))
  
  # Confusion Matrix
  exp6_confMat <- confusionMatrix(exp6_predictions$class, exp6_test$y)
  
  # Store the results (Accuracy in this case) in the results dataframe
  exp6_results <- rbind(exp6_results, data.frame(mfinal = mfinal, 
                                                 Accuracy = exp6_confMat$overall['Accuracy']))
}

# Print out the results for each mfinal
print(exp6_results)
##           mfinal  Accuracy
## Accuracy      25 0.9064263
## Accuracy1     50 0.9118460
## Accuracy2     75 0.9093021
## Accuracy3    100 0.9095233
# Find the best mfinal value (highest accuracy)
best_mfinal <- exp6_results[which.max(exp6_results$Accuracy), "mfinal"]
cat("Best mfinal value based on accuracy:", best_mfinal, "\n")
## Best mfinal value based on accuracy: 50
# Train the final model with the best mfinal
exp6_model <- boosting(y ~ ., data = exp6_train, mfinal = best_mfinal)

# Predict using the final model and calculate confusion matrix
final_predictions <- predict(exp6_model, newdata = exp6_test)
final_predictions$class <- factor(final_predictions$class, levels = levels(exp6_train$y))
exp6_cm <- confusionMatrix(final_predictions$class, exp5_test$y)

# Show the final confusion matrix
print(exp6_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7695  565
##        yes  289  492
##                                           
##                Accuracy : 0.9055          
##                  95% CI : (0.8993, 0.9115)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 4.41e-12        
##                                           
##                   Kappa : 0.4841          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9638          
##             Specificity : 0.4655          
##          Pos Pred Value : 0.9316          
##          Neg Pred Value : 0.6300          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8511          
##    Detection Prevalence : 0.9136          
##       Balanced Accuracy : 0.7146          
##                                           
##        'Positive' Class : no              
## 
# Probability predictions for AUC calculation
exp6_probabilities_ada <- predict(exp6_model, newdata = exp6_test, type = "prob")$prob[,2]

# AUC calculation
exp6_auc <- auc(roc(exp6_test$y, exp6_probabilities_ada))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the results matrix
results_matrix <- rbind(results_matrix, 
                        c("6 Adaboost Tuned",
                          exp6_cm$overall["Accuracy"],
                          exp6_cm$byClass["Precision"],
                          exp6_cm$byClass["Recall"],
                          exp6_cm$byClass["F1"],
                          exp6_auc))

# Show results matrix
results_matrix
##      Model                      Precision           Accuracy           
## [1,] "1 Decision Tree"          "0.902333812631346" "0.919036940871002"
## [2,] "2 Decision Tree Balanced" "0.809366130558184" "0.798357664233577"
## [3,] "3 Random Forest"          "0.911624820263245" "0.936141799198737"
## [4,] "4 Random Forest Tuned"    "0.906868709213583" "0.924310836501901"
## [5,] "5 Adaboost (adabag)"      "0.908306603251853" "0.933373712901272"
## [6,] "6 Adaboost Tuned"         "0.905541422409026" "0.931598062953995"
##      Recall              F1                  AUC                
## [1,] "0.975325651302605" "0.946345020356079" "0.741402625496973"
## [2,] "0.827814569536424" "0.812819321876451" "0.843501761916994"
## [3,] "0.965806613226453" "0.950742864188398" "0.937082004595757"
## [4,] "0.974323647294589" "0.948658536585366" "0.928571961804403"
## [5,] "0.965055110220441" "0.948950058501139" "0.932185563179339"
## [6,] "0.963802605210421" "0.947426742181729" "0.930387975572716"