This document contains all code for Assignment 2, to accompany my essay.
I began by loading a set of libraries and the raw dataset:
library(tidyverse)
library(caret)
library(ggplot2)
library(rpart)
library(rattle)
library(ROSE)
library(pROC)
library(doParallel)
library(fastAdaboost)
library(xgboost)
# Load data
df <- read_csv("https://raw.githubusercontent.com/AmandaSFox/DATA622/refs/heads/main/Assignment_2/df_preprocessed.csv")
The new dataset with the economic indicators underwent the same EDA as the previous assignment, with some key changes. Below, I remove the highly correlated new features and the post hoc variable “duration,” convert the character columns to factors, and create a matrix to hold the metrics for each model.
#---------------------------------
# Prepare Data
#---------------------------------
# Find highly correlated features (among the newly added economic indicators)
df_num <- df %>%
select(where(is.numeric))
cor_matrix <- df_num %>%
cor(use = "pairwise.complete.obs")
cor_df <- as.data.frame(as.table(cor_matrix)) %>%
filter(Var1 != Var2) %>%
filter(abs(Freq) > 0.1) %>%
arrange(desc(abs(Freq)))
head(cor_df)
## Var1 Var2 Freq
## 1 euribor3m emp.var.rate 0.9722447
## 2 emp.var.rate euribor3m 0.9722447
## 3 nr.employed euribor3m 0.9451544
## 4 euribor3m nr.employed 0.9451544
## 5 nr.employed emp.var.rate 0.9069701
## 6 emp.var.rate nr.employed 0.9069701
# Remove two of three correlated features
df <- df %>%
select(-emp.var.rate, -nr.employed)
# Remove post hoc call duration feature
df <- df %>%
select(-Duration)
# Change character cols to factors
character_columns <- sapply(df, is.character)
df[character_columns] <- lapply(df[character_columns], as.factor)
glimpse(df)
## Rows: 41,188
## Columns: 16
## $ Age <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, …
## $ Occupation <fct> housemaid, services, services, admin., s…
## $ Marital_Status <fct> married, married, married, married, marr…
## $ Contact_Type <fct> telephone, telephone, telephone, telepho…
## $ Month <fct> may, may, may, may, may, may, may, may, …
## $ Day <fct> mon, mon, mon, mon, mon, mon, mon, mon, …
## $ Contacts_This_Campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Days_Since_Last_Campaign <dbl> 999, 999, 999, 999, 999, 999, 999, 999, …
## $ Contacts_Before_This_Campaign <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Previous_Outcome <fct> nonexistent, nonexistent, nonexistent, n…
## $ cpi <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, …
## $ cci <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857…
## $ Y <fct> no, no, no, no, no, no, no, no, no, no, …
## $ Education2 <fct> less.than.hs, high.school, high.school, …
## $ Loan_Profile <fct> No Loans - No Default, No Loans - No Def…
summary(df)
## Age Occupation Marital_Status Contact_Type
## Min. :17.00 admin. :10422 divorced: 4612 cellular :26144
## 1st Qu.:32.00 blue-collar: 9254 married :24928 telephone:15044
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## Month Day Contacts_This_Campaign Days_Since_Last_Campaign
## may :13769 fri:7827 Min. : 1.000 Min. : 0.0
## jul : 7174 mon:8514 1st Qu.: 1.000 1st Qu.:999.0
## aug : 6178 thu:8623 Median : 2.000 Median :999.0
## jun : 5318 tue:8090 Mean : 2.568 Mean :962.5
## nov : 4101 wed:8134 3rd Qu.: 3.000 3rd Qu.:999.0
## apr : 2632 Max. :56.000 Max. :999.0
## (Other): 2016
## Contacts_Before_This_Campaign Previous_Outcome cpi
## Min. :0.000 failure : 4252 Min. :92.20
## 1st Qu.:0.000 nonexistent:35563 1st Qu.:93.08
## Median :0.000 success : 1373 Median :93.75
## Mean :0.173 Mean :93.58
## 3rd Qu.:0.000 3rd Qu.:93.99
## Max. :7.000 Max. :94.77
##
## cci euribor3m Y Education2
## Min. :-50.8 Min. :0.634 no :36548 high.school : 9515
## 1st Qu.:-42.7 1st Qu.:1.344 yes: 4640 less.than.hs :12531
## Median :-41.8 Median :4.857 professional.course: 5243
## Mean :-40.5 Mean :3.621 university.degree :12168
## 3rd Qu.:-36.4 3rd Qu.:4.961 unknown : 1731
## Max. :-26.9 Max. :5.045
##
## Loan_Profile
## Both Loans - No Default : 2968
## Housing Loan - In Default : 1
## Housing Loan - No Default :14243
## No Loans - In Default : 2
## No Loans - No Default :21966
## Personal Loan - No Default: 2008
##
# Initialize matrix to store metrics from each experiment
matrix_metrics <- matrix(NA, nrow = 0, ncol = 6)
colnames(matrix_metrics) <- c("Model",
"Accuracy",
"Precision",
"Recall",
"F1",
"AUC")
# Turn on parallel processing (chatgpt)
num_cores <- detectCores() - 1 # Use one less core
cl <- makePSOCKcluster(num_cores)
registerDoParallel(cl)
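The same evaluation steps repeat for every experiment below: predict classes, build a confusion matrix, compute AUC from predicted probabilities, and append a row to the metrics matrix. The sketch below shows a hypothetical helper that would collapse those steps into one call; it is not used in the code that follows, and the function name and arguments are my own.
# Hypothetical helper (not used below): evaluate one model and append
# its metrics to the metrics matrix in a single call
append_metrics <- function(metrics, label, predictions, probabilities, truth) {
  cm <- confusionMatrix(predictions, truth)
  auc_val <- auc(roc(truth, probabilities))
  rbind(metrics,
        c(label,
          cm$overall["Accuracy"],
          cm$byClass["Precision"],
          cm$byClass["Recall"],
          cm$byClass["F1"],
          auc_val))
}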
The first model is a baseline: a simple decision tree with no resampling and no cross-validation.
#---------------------------------
# 1. Decision Tree: Unbalanced Y
#---------------------------------
# Partition 70/30
set.seed(1)
arr_sample_unbal <- createDataPartition(y = df$Y,
p = .7,
list = FALSE)
df_unbal_train <- df[arr_sample_unbal, ]
df_unbal_test <- df[-arr_sample_unbal, ]
# Train first model
set.seed(123)
model_dt_unbal <- rpart(Y ~ .,
method = "class",
data = df_unbal_train)
fancyRpartPlot(model_dt_unbal)
# Test first model
predictions_unbal <- predict(model_dt_unbal,
newdata = df_unbal_test,
type = "class")
cm_unbal <- confusionMatrix(predictions_unbal,
df_unbal_test$Y)
probabilities_unbal <- predict(model_dt_unbal,
newdata = df_unbal_test,
type = "prob")[, 2]
auc_unbal <- auc(roc(df_unbal_test$Y,
probabilities_unbal))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
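The “Setting levels” and “Setting direction” messages come from pROC guessing which factor level is the positive class and which direction of the predictor indicates a case. They are harmless here, but can be silenced by being explicit; a minimal sketch, equivalent to the call above:
# Equivalent AUC call with explicit levels and direction (silences the messages)
auc(roc(df_unbal_test$Y,
        probabilities_unbal,
        levels = c("no", "yes"),   # control = no, case = yes
        direction = "<"))          # controls have lower predicted probability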
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("1 Tree Unbalanced Y",
cm_unbal$overall["Accuracy"],
cm_unbal$byClass["Precision"],
cm_unbal$byClass["Recall"],
cm_unbal$byClass["F1"],
auc_unbal))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
The second experiment balances the target variable Y using ROSE and fits another decision tree, isolating the impact of balancing alone.
#---------------------------------
# 2. Decision Tree: Balanced (ROSE)
#---------------------------------
# show imbalance in Y
df %>%
count(Y) %>%
mutate(percentage = n / sum(n) * 100)
## # A tibble: 2 × 3
## Y n percentage
## <fct> <int> <dbl>
## 1 no 36548 88.7
## 2 yes 4640 11.3
# Balance using ROSE()
df_bal <- ROSE(Y ~ ., data = df)$data
# Check the new balance
df_bal %>%
count(Y) %>%
mutate(percentage = n / sum(n) * 100)
## Y n percentage
## 1 no 20655 50.1481
## 2 yes 20533 49.8519
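Note that ROSE() generates synthetic observations, so the exact class counts above will vary from run to run. The function accepts a seed argument for reproducibility; a minimal sketch of the same call with a fixed (arbitrary) seed:
# Same balancing call, made reproducible with ROSE's seed argument
df_bal <- ROSE(Y ~ ., data = df, seed = 3)$data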
# Partition balanced data <- use for all remaining models
set.seed(2)
arr_sample_bal <- createDataPartition(y = df_bal$Y,
p = .7,
list = FALSE)
df_bal_train <- df_bal[arr_sample_bal, ]
df_bal_test <- df_bal[-arr_sample_bal, ]
# Train model
set.seed(234)
dt_bal <- rpart(Y ~ .,
method = "class",
data = df_bal_train)
fancyRpartPlot(dt_bal)
# Test model
predictions_dt_bal <- predict(dt_bal,
newdata = df_bal_test,
type = "class")
cm_dt_bal <- confusionMatrix(predictions_dt_bal,
df_bal_test$Y)
probabilities_dt_bal <- predict(dt_bal,
newdata = df_bal_test,
type = "prob")[, 2]
auc_dt_bal <- auc(roc(df_bal_test$Y,
probabilities_dt_bal))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("2 Tree Balanced Y",
cm_dt_bal$overall["Accuracy"],
cm_dt_bal$byClass["Precision"],
cm_dt_bal$byClass["Recall"],
cm_dt_bal$byClass["F1"],
auc_dt_bal))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
The third experiment applies the first ensemble model, a random forest with default settings (randomForest package), to isolate the impact of bagging.
#---------------------------------
# 3. Random Forest: Default settings
#---------------------------------
# Using same train/test dataset as above (balanced Y)
set.seed(345)
rf_default <- randomForest::randomForest(Y~.,
data = df_bal_train)
# Test model
predictions_rf_bal <- predict(rf_default,
newdata = df_bal_test,
type = "class")
cm_rf_bal <- confusionMatrix(predictions_rf_bal,
df_bal_test$Y)
probabilities_rf_bal <- predict(rf_default,
newdata = df_bal_test,
type = "prob")[, 2]
auc_rf_bal <- auc(roc(df_bal_test$Y,
probabilities_rf_bal))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("3 Random Forest Default",
cm_rf_bal$overall["Accuracy"],
cm_rf_bal$byClass["Precision"],
cm_rf_bal$byClass["Recall"],
cm_rf_bal$byClass["F1"],
auc_rf_bal))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
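Not part of the experiments above, but the fitted default forest can also be inspected to see which predictors drive its splits; a brief sketch using the randomForest package's built-in importance measures:
# Optional: variable importance from the default random forest
randomForest::importance(rf_default)   # mean decrease in Gini per predictor
randomForest::varImpPlot(rf_default)   # plot the same importances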
The fourth experiment tunes the random forest's mtry hyperparameter using a grid search with five-fold cross-validation.
#---------------------------------
# 4. Random Forest: Hyperparameter tuned (grid)
#---------------------------------
# Define tuning grid for mtry values, five-fold cross-validation
rf_tuneGrid <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12)) # expanded from the original 2,4,6,8
rf_control <- trainControl(method = "cv", number = 5)
# Train model (caret package with hyperparameter tuning)
set.seed(456)
rf_tuned <- caret::train(Y ~ .,
data = df_bal_train,
method = "rf",
trControl = rf_control,
tuneGrid = rf_tuneGrid)
# View caret's tuning selection results
rf_tuned$results
## mtry Accuracy Kappa AccuracySD KappaSD
## 1 2 0.8016158 0.6029691 0.004442844 0.008903456
## 2 4 0.8931429 0.7862685 0.003644773 0.007292805
## 3 6 0.9001486 0.8002878 0.004241572 0.008484616
## 4 8 0.9005301 0.8010498 0.004724323 0.009450303
## 5 10 0.9005300 0.8010473 0.005254109 0.010509515
## 6 12 0.9012236 0.8024322 0.005229625 0.010460645
rf_tuned$bestTune
## mtry
## 6 12
rf_tuned$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 12
##
## OOB estimate of error rate: 9.92%
## Confusion matrix:
## no yes class.error
## no 13207 1252 0.08658967
## yes 1607 12767 0.11179908
# Test model
predictions_rf_tuned <- predict(rf_tuned,
newdata = df_bal_test,
type = "raw") #caret requires raw or prob
cm_rf_tuned <- confusionMatrix(predictions_rf_tuned,
df_bal_test$Y)
probabilities_rf_tuned <- predict(rf_tuned,
newdata = df_bal_test,
type = "prob")[, 2]
auc_rf_tuned <- auc(roc(df_bal_test$Y,
probabilities_rf_tuned))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("4 Random Forest Tuned",
cm_rf_tuned$overall["Accuracy"],
cm_rf_tuned$byClass["Precision"],
cm_rf_tuned$byClass["Recall"],
cm_rf_tuned$byClass["F1"],
auc_rf_tuned))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
The fifth experiment uses another type of ensemble model, Adaboost, which employs boosting instead of bagging. This was a very fussy model to apply, and I was not successful with the default settings, so the model is trained through caret with a hyperparameter grid.
#---------------------------------
# 5. Adaboost: Hyperparameter tuned (grid)
#---------------------------------
# Additional data prep for adaboost (chatgpt)
ada_train <- droplevels(df_bal_train)
str(ada_train)
## 'data.frame': 28833 obs. of 16 variables:
## $ Age : num 51 42.5 34.6 39.7 36.6 ...
## $ Occupation : Factor w/ 12 levels "admin.","blue-collar",..: 6 8 1 2 2 10 10 10 2 11 ...
## $ Marital_Status : Factor w/ 4 levels "divorced","married",..: 2 2 3 2 3 2 1 3 2 2 ...
## $ Contact_Type : Factor w/ 2 levels "cellular","telephone": 1 1 1 2 2 1 2 1 1 2 ...
## $ Month : Factor w/ 10 levels "apr","aug","dec",..: 2 7 4 7 4 8 5 7 8 7 ...
## $ Day : Factor w/ 5 levels "fri","mon","thu",..: 2 3 3 5 2 5 2 1 5 1 ...
## $ Contacts_This_Campaign : num 0.5764 7.3468 0.0517 0.0437 2.7586 ...
## $ Days_Since_Last_Campaign : num 1011 985 993 1004 932 ...
## $ Contacts_Before_This_Campaign: num 0.03447 0.9135 0.21489 -0.02306 0.00827 ...
## $ Previous_Outcome : Factor w/ 3 levels "failure","nonexistent",..: 2 1 2 2 2 2 2 2 2 2 ...
## $ cpi : num 93.5 92.8 94.1 93.9 93.8 ...
## $ cci : num -36.6 -42.2 -44.2 -35 -46.9 ...
## $ euribor3m : num 5.63 1.67 4.58 4.54 4.14 ...
## $ Y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Education2 : Factor w/ 5 levels "high.school",..: 2 1 2 2 2 3 3 4 3 1 ...
## $ Loan_Profile : Factor w/ 4 levels "Both Loans - No Default",..: 3 3 3 3 2 2 3 3 1 2 ...
# Define tuning grid
# caret's adaboost method exposes nIter and method; only nIter is varied here
ada_tuneGrid <- expand.grid(nIter = c(25,50,75),
method = "Adaboost.M1")
ada_control <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Train model (fastAdaboost package with hyperparameter tuning)
set.seed(567)
ada_tuned <- caret::train(Y ~ .,
data = ada_train,
method = "adaboost", #fastAdaboost package
trControl = ada_control,
tuneGrid = ada_tuneGrid,
metric = "ROC",
preProcess = c("center","scale"))
# View caret's tuning selection results
ada_tuned$bestTune
## nIter method
## 3 75 Adaboost.M1
ada_tuned$results
## nIter method ROC Sens Spec ROCSD SensSD
## 1 25 Adaboost.M1 0.9515057 0.9134791 0.8687908 0.001242549 0.003785952
## 2 50 Adaboost.M1 0.9545745 0.9132720 0.8749825 0.001080596 0.004465775
## 3 75 Adaboost.M1 0.9560734 0.9177672 0.8767913 0.001096651 0.002977316
## SpecSD
## 1 0.005585708
## 2 0.003380664
## 3 0.003895649
ada_tuned$finalModel
## fastAdaboost::adaboost(formula = .outcome ~ ., data = dat, nIter = param$nIter)
## .outcome ~ .
## <environment: 0x000001d71cb93950>
## Dependent Variable: .outcome
## No of trees:75
## The weights of the trees are:1.2500541.0728941.0457791.0141090.98558630.9822520.95604720.97148590.94645380.95429770.92286750.95526210.94083450.95229630.87284620.95486240.91912080.92553920.9460750.94195680.92091560.95260510.94478220.94028870.90211510.93165290.97721050.98422020.88984810.99408480.96167580.97130450.94320620.95874270.94499110.96803810.97412420.93575720.96401630.96686790.88580591.018291.0358430.91143510.98618641.0255080.8045320.99560531.0063190.95741431.0455890.95386811.0972280.92830520.94818950.76388941.1309150.98122930.97393740.83635381.0103350.91416840.81548121.0472750.96316110.95291490.92435061.0250290.94745021.0537821.0080060.91431151.0854731.0720230.9930147
# Test model
# Adaboost cannot score test rows with factor levels unseen in training
# (this affected one record), so make a test copy just for Adaboost
df_bal_test_ada <- df_bal_test
# Align factor levels for ALL factor columns
for (col in names(df_bal_test_ada)) {
if (is.factor(df_bal_test_ada[[col]]) && col %in% names(ada_train)) {
df_bal_test_ada[[col]] <- factor(df_bal_test_ada[[col]],
levels = levels(ada_train[[col]]))
}
}
# Now drop any rows with NA introduced by unseen factor levels
df_bal_test_ada <- df_bal_test_ada %>%
drop_na()
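A quick sanity check (a sketch, not in the original run) shows how many test rows were dropped because their factor levels never appeared in the Adaboost training data:
# Rows removed from the Adaboost test copy due to unseen factor levels
nrow(df_bal_test) - nrow(df_bal_test_ada)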
predictions_ada_tuned <- predict(ada_tuned,
newdata = df_bal_test_ada,
type = "raw")
cm_ada_tuned <- confusionMatrix(predictions_ada_tuned,
df_bal_test_ada$Y)
probabilities_ada_tuned <- predict(ada_tuned,
newdata = df_bal_test_ada,
type = "prob")[, 2]
auc_ada_tuned <- auc(roc(df_bal_test_ada$Y,
probabilities_ada_tuned))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("5 Adaboost Tuned",
cm_ada_tuned$overall["Accuracy"],
cm_ada_tuned$byClass["Precision"],
cm_ada_tuned$byClass["Recall"],
cm_ada_tuned$byClass["F1"],
auc_ada_tuned))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## [5,] "5 Adaboost Tuned" "0.891614052128865" "0.873423561980929"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
## [5,] "0.916707021791768" "0.89454201779948" "0.956248177663717"
Finally, experiments 6 and 7 use a different type of boosted ensemble model, xgBoost. I began with simple tuning and then expanded the grid to achieve the best possible results for comparison.
#---------------------------------
# 6. xgBoost: Hyperparameter tuned (grid)
#---------------------------------
xg_tuneGrid <- expand.grid(nrounds = 100,
max_depth = c(3,6),
eta = c(0.1, 0.3),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xg_control <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Train model
set.seed(678)
xg_tuned <- caret::train(Y ~ .,
data = ada_train,
method = "xgbTree",
trControl = xg_control,
tuneGrid = xg_tuneGrid,
metric = "ROC",
preProcess = c("center","scale"))
# View caret's tuning selection results
xg_tuned$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 4 100 6 0.3 0 1 1 1
xg_tuned$results
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.1 3 0 1 1 1 100
## 3 0.3 3 0 1 1 1 100
## 2 0.1 6 0 1 1 1 100
## 4 0.3 6 0 1 1 1 100
## ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.9532854 0.9277270 0.8529987 0.001408975 0.007315221 0.005049537
## 3 0.9590915 0.9269661 0.8680259 0.001406446 0.004132723 0.007325118
## 2 0.9586439 0.9257904 0.8664257 0.001087678 0.003351774 0.006721385
## 4 0.9592010 0.9208801 0.8754004 0.002383292 0.003095726 0.004855296
xg_tuned$finalModel
## ##### xgb.Booster
## raw: 285 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, objective = "binary:logistic")
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## # of features: 44
## niter: 100
## nfeatures : 44
## xNames : Age Occupationblue-collar Occupationentrepreneur Occupationhousemaid Occupationmanagement Occupationretired Occupationself-employed Occupationservices Occupationstudent Occupationtechnician Occupationunemployed Occupationunknown Marital_Statusmarried Marital_Statussingle Marital_Statusunknown Contact_Typetelephone Monthaug Monthdec Monthjul Monthjun Monthmar Monthmay Monthnov Monthoct Monthsep Daymon Daythu Daytue Daywed Contacts_This_Campaign Days_Since_Last_Campaign Contacts_Before_This_Campaign Previous_Outcomenonexistent Previous_Outcomesuccess cpi cci euribor3m Education2less.than.hs Education2professional.course Education2university.degree Education2unknown Loan_ProfileHousing Loan - No Default Loan_ProfileNo Loans - No Default Loan_ProfilePersonal Loan - No Default
## problemType : Classification
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 4 100 6 0.3 0 1 1 1
## obsLevels : no yes
## param :
## list()
# Test model
predictions_xg_tuned <- predict(xg_tuned,
newdata = df_bal_test_ada,
type = "raw")
cm_xg_tuned <- confusionMatrix(predictions_xg_tuned,
df_bal_test_ada$Y)
probabilities_xg_tuned <- predict(xg_tuned,
newdata = df_bal_test_ada,
type = "prob")[, 2]
auc_xg_tuned <- auc(roc(df_bal_test_ada$Y,
probabilities_xg_tuned))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("6 xgBoost Tuned",
cm_xg_tuned$overall["Accuracy"],
cm_xg_tuned$byClass["Precision"],
cm_xg_tuned$byClass["Recall"],
cm_xg_tuned$byClass["F1"],
auc_xg_tuned))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## [5,] "5 Adaboost Tuned" "0.891614052128865" "0.873423561980929"
## [6,] "6 xgBoost Tuned" "0.894123360854784" "0.874942458186282"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
## [5,] "0.916707021791768" "0.89454201779948" "0.956248177663717"
## [6,] "0.920419693301049" "0.897105097545626" "0.958505522407873"
#---------------------------------
# 7. xgBoost #2: Refined hyperparameter tuning (grid)
#---------------------------------
xg_tuneGrid_2 <- expand.grid(nrounds = c(300,500),
max_depth = c(3,6),
eta = c(0.1, 0.3, 0.5, 0.7),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xg_control <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Remove any earlier run of model 7 from the metrics matrix before re-adding it
matrix_metrics <- matrix_metrics[matrix_metrics[, "Model"] != "7 xgBoost Tuned 2", , drop = FALSE]
# Train model
set.seed(7890)
xg_tuned2 <- caret::train(Y ~ .,
data = ada_train,
method = "xgbTree",
trControl = xg_control,
tuneGrid = xg_tuneGrid_2,
metric = "ROC",
preProcess = c("center","scale"))
# View caret's tuning selection results
xg_tuned2$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 3 300 6 0.1 0 1 1 1
xg_tuned2$results
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.1 3 0 1 1 1 300
## 5 0.3 3 0 1 1 1 300
## 9 0.5 3 0 1 1 1 300
## 13 0.7 3 0 1 1 1 300
## 3 0.1 6 0 1 1 1 300
## 7 0.3 6 0 1 1 1 300
## 11 0.5 6 0 1 1 1 300
## 15 0.7 6 0 1 1 1 300
## 2 0.1 3 0 1 1 1 500
## 6 0.3 3 0 1 1 1 500
## 10 0.5 3 0 1 1 1 500
## 14 0.7 3 0 1 1 1 500
## 4 0.1 6 0 1 1 1 500
## 8 0.3 6 0 1 1 1 500
## 12 0.5 6 0 1 1 1 500
## 16 0.7 6 0 1 1 1 500
## ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.9587113 0.9282110 0.8660077 0.0018139963 0.006936967 0.006714591
## 5 0.9602232 0.9256522 0.8735910 0.0011007549 0.008740264 0.003367419
## 9 0.9575008 0.9206034 0.8723390 0.0016350743 0.009509261 0.005291526
## 13 0.9537088 0.9119584 0.8702517 0.0018441841 0.007904567 0.006128714
## 3 0.9603112 0.9262747 0.8747039 0.0013420697 0.008548211 0.006593165
## 7 0.9558581 0.9178368 0.8739388 0.0020919100 0.007824727 0.004246587
## 11 0.9544239 0.9136180 0.8726171 0.0023557790 0.009677524 0.004608908
## 15 0.9511078 0.9114738 0.8694865 0.0022248176 0.007256943 0.004837835
## 2 0.9602386 0.9295253 0.8691383 0.0014284359 0.007079300 0.005460137
## 6 0.9590421 0.9221939 0.8737301 0.0008528056 0.007664142 0.004206917
## 10 0.9554188 0.9163155 0.8720605 0.0021965151 0.008240287 0.006674031
## 14 0.9517372 0.9113356 0.8705302 0.0016828370 0.005059644 0.005979794
## 4 0.9596356 0.9246838 0.8759562 0.0014189847 0.009058871 0.006341201
## 8 0.9551759 0.9170760 0.8733125 0.0022515964 0.008599235 0.006052744
## 12 0.9542882 0.9141714 0.8724083 0.0020691886 0.010539965 0.004771330
## 16 0.9519184 0.9116817 0.8710865 0.0018127377 0.009592941 0.006018068
xg_tuned2$finalModel
## ##### xgb.Booster
## raw: 809.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, objective = "binary:logistic")
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## # of features: 44
## niter: 300
## nfeatures : 44
## xNames : Age Occupationblue-collar Occupationentrepreneur Occupationhousemaid Occupationmanagement Occupationretired Occupationself-employed Occupationservices Occupationstudent Occupationtechnician Occupationunemployed Occupationunknown Marital_Statusmarried Marital_Statussingle Marital_Statusunknown Contact_Typetelephone Monthaug Monthdec Monthjul Monthjun Monthmar Monthmay Monthnov Monthoct Monthsep Daymon Daythu Daytue Daywed Contacts_This_Campaign Days_Since_Last_Campaign Contacts_Before_This_Campaign Previous_Outcomenonexistent Previous_Outcomesuccess cpi cci euribor3m Education2less.than.hs Education2professional.course Education2university.degree Education2unknown Loan_ProfileHousing Loan - No Default Loan_ProfileNo Loans - No Default Loan_ProfilePersonal Loan - No Default
## problemType : Classification
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 3 300 6 0.1 0 1 1 1
## obsLevels : no yes
## param :
## list()
# Test model
predictions_xg_tuned2 <- predict(xg_tuned2,
newdata = df_bal_test_ada,
type = "raw")
cm_xg_tuned2 <- confusionMatrix(predictions_xg_tuned2,
df_bal_test_ada$Y)
probabilities_xg_tuned2 <- predict(xg_tuned2,
newdata = df_bal_test_ada,
type = "prob")[, 2]
auc_xg_tuned2 <- auc(roc(df_bal_test_ada$Y,
probabilities_xg_tuned2))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("7 xgBoost Tuned 2",
cm_xg_tuned2$overall["Accuracy"],
cm_xg_tuned2$byClass["Precision"],
cm_xg_tuned2$byClass["Recall"],
cm_xg_tuned2$byClass["F1"],
auc_xg_tuned2))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## [5,] "5 Adaboost Tuned" "0.891614052128865" "0.873423561980929"
## [6,] "6 xgBoost Tuned" "0.894123360854784" "0.874942458186282"
## [7,] "7 xgBoost Tuned 2" "0.896308887809616" "0.875688073394495"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
## [5,] "0.916707021791768" "0.89454201779948" "0.956248177663717"
## [6,] "0.920419693301049" "0.897105097545626" "0.958505522407873"
## [7,] "0.924455205811138" "0.899411071849234" "0.960484397787394"
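Because rbind() on a character vector coerces every column to character, the final matrix stores the metrics as strings. A minimal sketch (not part of the original run) that converts it to a tibble with numeric metric columns for easier comparison or plotting:
# Optional: convert the metrics matrix to a tibble with numeric metric columns
df_metrics <- as_tibble(matrix_metrics) %>%
  mutate(across(-Model, as.numeric))
df_metrics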
#---------------------------------
# parallel processing stop (chatgpt)
#---------------------------------
stopCluster(cl)
registerDoSEQ()