This document contains all code for Assignment 2, to accompany my essay.
I began by loading a set of libraries and the raw dataset:
library(tidyverse)
library(caret)
library(ggplot2)
library(rpart)
library(rattle)
library(ROSE)
library(pROC)
library(doParallel)
library(fastAdaboost)
library(xgboost)
# Load data
df <- read_csv("https://raw.githubusercontent.com/AmandaSFox/DATA622/refs/heads/main/Assignment_2/df_preprocessed.csv")
The new dataset with the economic indicators underwent the same EDA as the previous assignment, with some key changes. Below, I remove the highly correlated new features and the post hoc variable “duration,” convert the character columns to factors, and create a matrix to hold the metrics for each model.
#---------------------------------
# Prepare Data
#---------------------------------
# Find highly correlated features (among the newly added economic indicators)
df_num <- df %>%
select(where(is.numeric))
cor_matrix <- df_num %>%
cor(use = "pairwise.complete.obs")
cor_df <- as.data.frame(as.table(cor_matrix)) %>%
filter(Var1 != Var2) %>%
filter(abs(Freq) > 0.1) %>%
arrange(desc(abs(Freq)))
head(cor_df)
## Var1 Var2 Freq
## 1 euribor3m emp.var.rate 0.9722447
## 2 emp.var.rate euribor3m 0.9722447
## 3 nr.employed euribor3m 0.9451544
## 4 euribor3m nr.employed 0.9451544
## 5 nr.employed emp.var.rate 0.9069701
## 6 emp.var.rate nr.employed 0.9069701
# Remove two of three correlated features
df <- df %>%
select(-emp.var.rate, -nr.employed)
# Remove post hoc call duration feature
df <- df %>%
select(-Duration)
# Change character cols to factors
character_columns <- sapply(df, is.character)
df[character_columns] <- lapply(df[character_columns], as.factor)
glimpse(df)
## Rows: 41,188
## Columns: 16
## $ Age <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, …
## $ Occupation <fct> housemaid, services, services, admin., s…
## $ Marital_Status <fct> married, married, married, married, marr…
## $ Contact_Type <fct> telephone, telephone, telephone, telepho…
## $ Month <fct> may, may, may, may, may, may, may, may, …
## $ Day <fct> mon, mon, mon, mon, mon, mon, mon, mon, …
## $ Contacts_This_Campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Days_Since_Last_Campaign <dbl> 999, 999, 999, 999, 999, 999, 999, 999, …
## $ Contacts_Before_This_Campaign <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Previous_Outcome <fct> nonexistent, nonexistent, nonexistent, n…
## $ cpi <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, …
## $ cci <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857…
## $ Y <fct> no, no, no, no, no, no, no, no, no, no, …
## $ Education2 <fct> less.than.hs, high.school, high.school, …
## $ Loan_Profile <fct> No Loans - No Default, No Loans - No Def…
summary(df)
## Age Occupation Marital_Status Contact_Type
## Min. :17.00 admin. :10422 divorced: 4612 cellular :26144
## 1st Qu.:32.00 blue-collar: 9254 married :24928 telephone:15044
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## Month Day Contacts_This_Campaign Days_Since_Last_Campaign
## may :13769 fri:7827 Min. : 1.000 Min. : 0.0
## jul : 7174 mon:8514 1st Qu.: 1.000 1st Qu.:999.0
## aug : 6178 thu:8623 Median : 2.000 Median :999.0
## jun : 5318 tue:8090 Mean : 2.568 Mean :962.5
## nov : 4101 wed:8134 3rd Qu.: 3.000 3rd Qu.:999.0
## apr : 2632 Max. :56.000 Max. :999.0
## (Other): 2016
## Contacts_Before_This_Campaign Previous_Outcome cpi
## Min. :0.000 failure : 4252 Min. :92.20
## 1st Qu.:0.000 nonexistent:35563 1st Qu.:93.08
## Median :0.000 success : 1373 Median :93.75
## Mean :0.173 Mean :93.58
## 3rd Qu.:0.000 3rd Qu.:93.99
## Max. :7.000 Max. :94.77
##
## cci euribor3m Y Education2
## Min. :-50.8 Min. :0.634 no :36548 high.school : 9515
## 1st Qu.:-42.7 1st Qu.:1.344 yes: 4640 less.than.hs :12531
## Median :-41.8 Median :4.857 professional.course: 5243
## Mean :-40.5 Mean :3.621 university.degree :12168
## 3rd Qu.:-36.4 3rd Qu.:4.961 unknown : 1731
## Max. :-26.9 Max. :5.045
##
## Loan_Profile
## Both Loans - No Default : 2968
## Housing Loan - In Default : 1
## Housing Loan - No Default :14243
## No Loans - In Default : 2
## No Loans - No Default :21966
## Personal Loan - No Default: 2008
##
# Initialize matrix to store metrics from each experiment
matrix_metrics <- matrix(NA, nrow = 0, ncol = 6)
colnames(matrix_metrics) <- c("Model",
"Accuracy",
"Precision",
"Recall",
"F1",
"AUC")
# Turn on parallel processing (chatgpt)
num_cores <- detectCores() - 1 # Use one less core
cl <- makePSOCKcluster(num_cores)
registerDoParallel(cl)
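The same evaluation steps repeat for every experiment below: predict classes, build a confusion matrix, compute AUC from predicted probabilities, and append a row to the metrics matrix. The sketch below shows a hypothetical helper that would collapse those steps into one call; it is not used in the code that follows, and the function name and arguments are my own.
# Hypothetical helper (not used below): evaluate one model and append
# its metrics to the metrics matrix in a single call
append_metrics <- function(metrics, label, predictions, probabilities, truth) {
  cm <- confusionMatrix(predictions, truth)
  auc_val <- auc(roc(truth, probabilities))
  rbind(metrics,
        c(label,
          cm$overall["Accuracy"],
          cm$byClass["Precision"],
          cm$byClass["Recall"],
          cm$byClass["F1"],
          auc_val))
}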
The first model is a baseline: a simple decision tree with no resampling and no cross-validation.
#---------------------------------
# 1. Decision Tree: Unbalanced Y
#---------------------------------
# Partition 70/30
set.seed(1)
arr_sample_unbal <- createDataPartition(y = df$Y,
p = .7,
list = FALSE)
df_unbal_train <- df[arr_sample_unbal, ]
df_unbal_test <- df[-arr_sample_unbal, ]
# Train first model
set.seed(123)
model_dt_unbal <- rpart(Y ~ .,
method = "class",
data = df_unbal_train)
fancyRpartPlot(model_dt_unbal)
# Test first model
predictions_unbal <- predict(model_dt_unbal,
newdata = df_unbal_test,
type = "class")
cm_unbal <- confusionMatrix(predictions_unbal,
df_unbal_test$Y)
probabilities_unbal <- predict(model_dt_unbal,
newdata = df_unbal_test,
type = "prob")[, 2]
auc_unbal <- auc(roc(df_unbal_test$Y,
probabilities_unbal))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
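The “Setting levels” and “Setting direction” messages come from pROC guessing which factor level is the positive class and which direction of the predictor indicates a case. They are harmless here, but can be silenced by being explicit; a minimal sketch, equivalent to the call above:
# Equivalent AUC call with explicit levels and direction (silences the messages)
auc(roc(df_unbal_test$Y,
        probabilities_unbal,
        levels = c("no", "yes"),   # control = no, case = yes
        direction = "<"))          # controls have lower predicted probability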
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("1 Tree Unbalanced Y",
cm_unbal$overall["Accuracy"],
cm_unbal$byClass["Precision"],
cm_unbal$byClass["Recall"],
cm_unbal$byClass["F1"],
auc_unbal))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
The second experiment balances the target variable Y using ROSE and fits another decision tree, isolating the impact of balancing alone.
#---------------------------------
# 2. Decision Tree: Balanced (ROSE)
#---------------------------------
# show imbalance in Y
df %>%
count(Y) %>%
mutate(percentage = n / sum(n) * 100)
## # A tibble: 2 × 3
## Y n percentage
## <fct> <int> <dbl>
## 1 no 36548 88.7
## 2 yes 4640 11.3
# Balance using ROSE()
df_bal <- ROSE(Y ~ ., data = df)$data
# Check the new balance
df_bal %>%
count(Y) %>%
mutate(percentage = n / sum(n) * 100)
## Y n percentage
## 1 no 20655 50.1481
## 2 yes 20533 49.8519
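Note that ROSE() generates synthetic observations, so the exact class counts above will vary from run to run. The function accepts a seed argument for reproducibility; a minimal sketch of the same call with a fixed (arbitrary) seed:
# Same balancing call, made reproducible with ROSE's seed argument
df_bal <- ROSE(Y ~ ., data = df, seed = 3)$data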
# Partition balanced data <- use for all remaining models
set.seed(2)
arr_sample_bal <- createDataPartition(y = df_bal$Y,
p = .7,
list = FALSE)
df_bal_train <- df_bal[arr_sample_bal, ]
df_bal_test <- df_bal[-arr_sample_bal, ]
# Train model
set.seed(234)
dt_bal <- rpart(Y ~ .,
method = "class",
data = df_bal_train)
fancyRpartPlot(dt_bal)
# Test model
predictions_dt_bal <- predict(dt_bal,
newdata = df_bal_test,
type = "class")
cm_dt_bal <- confusionMatrix(predictions_dt_bal,
df_bal_test$Y)
probabilities_dt_bal <- predict(dt_bal,
newdata = df_bal_test,
type = "prob")[, 2]
auc_dt_bal <- auc(roc(df_bal_test$Y,
probabilities_dt_bal))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("2 Tree Balanced Y",
cm_dt_bal$overall["Accuracy"],
cm_dt_bal$byClass["Precision"],
cm_dt_bal$byClass["Recall"],
cm_dt_bal$byClass["F1"],
auc_dt_bal))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
The third experiment applies the first ensemble model, a random forest with default settings (randomForest package), to isolate the impact of bagging.
#---------------------------------
# 3. Random Forest: Default settings
#---------------------------------
# Using same train/test dataset as above (balanced Y)
set.seed(345)
rf_default <- randomForest::randomForest(Y~.,
data = df_bal_train)
# Test model
predictions_rf_bal <- predict(rf_default,
newdata = df_bal_test,
type = "class")
cm_rf_bal <- confusionMatrix(predictions_rf_bal,
df_bal_test$Y)
probabilities_rf_bal <- predict(rf_default,
newdata = df_bal_test,
type = "prob")[, 2]
auc_rf_bal <- auc(roc(df_bal_test$Y,
probabilities_rf_bal))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("3 Random Forest Default",
cm_rf_bal$overall["Accuracy"],
cm_rf_bal$byClass["Precision"],
cm_rf_bal$byClass["Recall"],
cm_rf_bal$byClass["F1"],
auc_rf_bal))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
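Not part of the experiments above, but the fitted default forest can also be inspected to see which predictors drive its splits; a brief sketch using the randomForest package's built-in importance measures:
# Optional: variable importance from the default random forest
randomForest::importance(rf_default)   # mean decrease in Gini per predictor
randomForest::varImpPlot(rf_default)   # plot the same importances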
The fourth experiment tunes the random forest's mtry hyperparameter using a grid search with five-fold cross-validation.
#---------------------------------
# 4. Random Forest: Hyperparameter tuned (grid)
#---------------------------------
# Define tuning grid for mtry values, five-fold cross-validation
rf_tuneGrid <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12)) # expanded from the original 2,4,6,8
rf_control <- trainControl(method = "cv", number = 5)
# Train model (caret package with hyperparameter tuning)
set.seed(456)
rf_tuned <- caret::train(Y ~ .,
data = df_bal_train,
method = "rf",
trControl = rf_control,
tuneGrid = rf_tuneGrid)
# View caret's tuning selection results
rf_tuned$results
## mtry Accuracy Kappa AccuracySD KappaSD
## 1 2 0.8016158 0.6029691 0.004442844 0.008903456
## 2 4 0.8931429 0.7862685 0.003644773 0.007292805
## 3 6 0.9001486 0.8002878 0.004241572 0.008484616
## 4 8 0.9005301 0.8010498 0.004724323 0.009450303
## 5 10 0.9005300 0.8010473 0.005254109 0.010509515
## 6 12 0.9012236 0.8024322 0.005229625 0.010460645
rf_tuned$bestTune
## mtry
## 6 12
rf_tuned$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 12
##
## OOB estimate of error rate: 9.92%
## Confusion matrix:
## no yes class.error
## no 13207 1252 0.08658967
## yes 1607 12767 0.11179908
# Test model
predictions_rf_tuned <- predict(rf_tuned,
newdata = df_bal_test,
type = "raw") #caret requires raw or prob
cm_rf_tuned <- confusionMatrix(predictions_rf_tuned,
df_bal_test$Y)
probabilities_rf_tuned <- predict(rf_tuned,
newdata = df_bal_test,
type = "prob")[, 2]
auc_rf_tuned <- auc(roc(df_bal_test$Y,
probabilities_rf_tuned))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("4 Random Forest Tuned",
cm_rf_tuned$overall["Accuracy"],
cm_rf_tuned$byClass["Precision"],
cm_rf_tuned$byClass["Recall"],
cm_rf_tuned$byClass["F1"],
auc_rf_tuned))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
The fifth experiment uses another type of ensemble model, Adaboost, which employs boosting instead of bagging. This was a very fussy model to apply, and I was not successful with the default settings, so the model is trained through caret with a hyperparameter grid.
#---------------------------------
# 5. Adaboost: Hyperparameter tuned (grid)
#---------------------------------
# Additional data prep for adaboost (chatgpt)
ada_train <- droplevels(df_bal_train)
str(ada_train)
## 'data.frame': 28833 obs. of 16 variables:
## $ Age : num 51 42.5 34.6 39.7 36.6 ...
## $ Occupation : Factor w/ 12 levels "admin.","blue-collar",..: 6 8 1 2 2 10 10 10 2 11 ...
## $ Marital_Status : Factor w/ 4 levels "divorced","married",..: 2 2 3 2 3 2 1 3 2 2 ...
## $ Contact_Type : Factor w/ 2 levels "cellular","telephone": 1 1 1 2 2 1 2 1 1 2 ...
## $ Month : Factor w/ 10 levels "apr","aug","dec",..: 2 7 4 7 4 8 5 7 8 7 ...
## $ Day : Factor w/ 5 levels "fri","mon","thu",..: 2 3 3 5 2 5 2 1 5 1 ...
## $ Contacts_This_Campaign : num 0.5764 7.3468 0.0517 0.0437 2.7586 ...
## $ Days_Since_Last_Campaign : num 1011 985 993 1004 932 ...
## $ Contacts_Before_This_Campaign: num 0.03447 0.9135 0.21489 -0.02306 0.00827 ...
## $ Previous_Outcome : Factor w/ 3 levels "failure","nonexistent",..: 2 1 2 2 2 2 2 2 2 2 ...
## $ cpi : num 93.5 92.8 94.1 93.9 93.8 ...
## $ cci : num -36.6 -42.2 -44.2 -35 -46.9 ...
## $ euribor3m : num 5.63 1.67 4.58 4.54 4.14 ...
## $ Y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Education2 : Factor w/ 5 levels "high.school",..: 2 1 2 2 2 3 3 4 3 1 ...
## $ Loan_Profile : Factor w/ 4 levels "Both Loans - No Default",..: 3 3 3 3 2 2 3 3 1 2 ...
# Define tuning grid
# caret's adaboost method exposes nIter and method; only nIter is varied here
ada_tuneGrid <- expand.grid(nIter = c(25,50,75),
method = "Adaboost.M1")
ada_control <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Train model (fastAdaboost package with hyperparameter tuning)
set.seed(567)
ada_tuned <- caret::train(Y ~ .,
data = ada_train,
method = "adaboost", #fastAdaboost package
trControl = ada_control,
tuneGrid = ada_tuneGrid,
metric = "ROC",
preProcess = c("center","scale"))
# View caret's tuning selection results
ada_tuned$bestTune
## nIter method
## 3 75 Adaboost.M1
ada_tuned$results
## nIter method ROC Sens Spec ROCSD SensSD
## 1 25 Adaboost.M1 0.9515057 0.9134791 0.8687908 0.001242549 0.003785952
## 2 50 Adaboost.M1 0.9545745 0.9132720 0.8749825 0.001080596 0.004465775
## 3 75 Adaboost.M1 0.9560734 0.9177672 0.8767913 0.001096651 0.002977316
## SpecSD
## 1 0.005585708
## 2 0.003380664
## 3 0.003895649
ada_tuned$finalModel
## fastAdaboost::adaboost(formula = .outcome ~ ., data = dat, nIter = param$nIter)
## .outcome ~ .
## <environment: 0x000001d71cb93950>
## Dependent Variable: .outcome
## No of trees:75
## The weights of the trees are:1.2500541.0728941.0457791.0141090.98558630.9822520.95604720.97148590.94645380.95429770.92286750.95526210.94083450.95229630.87284620.95486240.91912080.92553920.9460750.94195680.92091560.95260510.94478220.94028870.90211510.93165290.97721050.98422020.88984810.99408480.96167580.97130450.94320620.95874270.94499110.96803810.97412420.93575720.96401630.96686790.88580591.018291.0358430.91143510.98618641.0255080.8045320.99560531.0063190.95741431.0455890.95386811.0972280.92830520.94818950.76388941.1309150.98122930.97393740.83635381.0103350.91416840.81548121.0472750.96316110.95291490.92435061.0250290.94745021.0537821.0080060.91431151.0854731.0720230.9930147
# Test model
# Adaboost cannot score test rows with factor levels unseen in training
# (this affected one record), so make a test copy just for Adaboost
df_bal_test_ada <- df_bal_test
# Align factor levels for ALL factor columns
for (col in names(df_bal_test_ada)) {
if (is.factor(df_bal_test_ada[[col]]) && col %in% names(ada_train)) {
df_bal_test_ada[[col]] <- factor(df_bal_test_ada[[col]],
levels = levels(ada_train[[col]]))
}
}
# Now drop any rows with NA introduced by unseen factor levels
df_bal_test_ada <- df_bal_test_ada %>%
drop_na()
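A quick sanity check (a sketch, not in the original run) shows how many test rows were dropped because their factor levels never appeared in the Adaboost training data:
# Rows removed from the Adaboost test copy due to unseen factor levels
nrow(df_bal_test) - nrow(df_bal_test_ada)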
predictions_ada_tuned <- predict(ada_tuned,
newdata = df_bal_test_ada,
type = "raw")
cm_ada_tuned <- confusionMatrix(predictions_ada_tuned,
df_bal_test_ada$Y)
probabilities_ada_tuned <- predict(ada_tuned,
newdata = df_bal_test_ada,
type = "prob")[, 2]
auc_ada_tuned <- auc(roc(df_bal_test_ada$Y,
probabilities_ada_tuned))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("5 Adaboost Tuned",
cm_ada_tuned$overall["Accuracy"],
cm_ada_tuned$byClass["Precision"],
cm_ada_tuned$byClass["Recall"],
cm_ada_tuned$byClass["F1"],
auc_ada_tuned))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## [5,] "5 Adaboost Tuned" "0.891614052128865" "0.873423561980929"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
## [5,] "0.916707021791768" "0.89454201779948" "0.956248177663717"
Finally, experiments 6 and 7 use a different type of boosted ensemble model, xgBoost. I began with simple tuning and then expanded the grid to achieve the best possible results for comparison.
#---------------------------------
# 6. xgBoost: Hyperparameter tuned (grid)
#---------------------------------
xg_tuneGrid <- expand.grid(nrounds = 100,
max_depth = c(3,6),
eta = c(0.1, 0.3),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xg_control <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Train model
set.seed(678)
xg_tuned <- caret::train(Y ~ .,
data = ada_train,
method = "xgbTree",
trControl = xg_control,
tuneGrid = xg_tuneGrid,
metric = "ROC",
preProcess = c("center","scale"))
# View caret's tuning selection results
xg_tuned$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 4 100 6 0.3 0 1 1 1
xg_tuned$results
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.1 3 0 1 1 1 100
## 3 0.3 3 0 1 1 1 100
## 2 0.1 6 0 1 1 1 100
## 4 0.3 6 0 1 1 1 100
## ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.9532854 0.9277270 0.8529987 0.001408975 0.007315221 0.005049537
## 3 0.9590915 0.9269661 0.8680259 0.001406446 0.004132723 0.007325118
## 2 0.9586439 0.9257904 0.8664257 0.001087678 0.003351774 0.006721385
## 4 0.9592010 0.9208801 0.8754004 0.002383292 0.003095726 0.004855296
xg_tuned$finalModel
## ##### xgb.Booster
## raw: 285 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, objective = "binary:logistic")
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## # of features: 44
## niter: 100
## nfeatures : 44
## xNames : Age Occupationblue-collar Occupationentrepreneur Occupationhousemaid Occupationmanagement Occupationretired Occupationself-employed Occupationservices Occupationstudent Occupationtechnician Occupationunemployed Occupationunknown Marital_Statusmarried Marital_Statussingle Marital_Statusunknown Contact_Typetelephone Monthaug Monthdec Monthjul Monthjun Monthmar Monthmay Monthnov Monthoct Monthsep Daymon Daythu Daytue Daywed Contacts_This_Campaign Days_Since_Last_Campaign Contacts_Before_This_Campaign Previous_Outcomenonexistent Previous_Outcomesuccess cpi cci euribor3m Education2less.than.hs Education2professional.course Education2university.degree Education2unknown Loan_ProfileHousing Loan - No Default Loan_ProfileNo Loans - No Default Loan_ProfilePersonal Loan - No Default
## problemType : Classification
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 4 100 6 0.3 0 1 1 1
## obsLevels : no yes
## param :
## list()
# Test model
predictions_xg_tuned <- predict(xg_tuned,
newdata = df_bal_test_ada,
type = "raw")
cm_xg_tuned <- confusionMatrix(predictions_xg_tuned,
df_bal_test_ada$Y)
probabilities_xg_tuned <- predict(xg_tuned,
newdata = df_bal_test_ada,
type = "prob")[, 2]
auc_xg_tuned <- auc(roc(df_bal_test_ada$Y,
probabilities_xg_tuned))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("6 xgBoost Tuned",
cm_xg_tuned$overall["Accuracy"],
cm_xg_tuned$byClass["Precision"],
cm_xg_tuned$byClass["Recall"],
cm_xg_tuned$byClass["F1"],
auc_xg_tuned))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## [5,] "5 Adaboost Tuned" "0.891614052128865" "0.873423561980929"
## [6,] "6 xgBoost Tuned" "0.894123360854784" "0.874942458186282"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
## [5,] "0.916707021791768" "0.89454201779948" "0.956248177663717"
## [6,] "0.920419693301049" "0.897105097545626" "0.958505522407873"
#---------------------------------
# 7. xgBoost #2: Refined hyperparameter tuning (grid)
#---------------------------------
xg_tuneGrid_2 <- expand.grid(nrounds = c(300,500),
max_depth = c(3,6),
eta = c(0.1, 0.3, 0.5, 0.7),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1)
xg_control <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Remove any earlier run of model 7 from the metrics matrix before re-adding it
matrix_metrics <- matrix_metrics[matrix_metrics[, "Model"] != "7 xgBoost Tuned 2", , drop = FALSE]
# Train model
set.seed(7890)
xg_tuned2 <- caret::train(Y ~ .,
data = ada_train,
method = "xgbTree",
trControl = xg_control,
tuneGrid = xg_tuneGrid_2,
metric = "ROC",
preProcess = c("center","scale"))
# View caret's tuning selection results
xg_tuned2$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 3 300 6 0.1 0 1 1 1
xg_tuned2$results
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.1 3 0 1 1 1 300
## 5 0.3 3 0 1 1 1 300
## 9 0.5 3 0 1 1 1 300
## 13 0.7 3 0 1 1 1 300
## 3 0.1 6 0 1 1 1 300
## 7 0.3 6 0 1 1 1 300
## 11 0.5 6 0 1 1 1 300
## 15 0.7 6 0 1 1 1 300
## 2 0.1 3 0 1 1 1 500
## 6 0.3 3 0 1 1 1 500
## 10 0.5 3 0 1 1 1 500
## 14 0.7 3 0 1 1 1 500
## 4 0.1 6 0 1 1 1 500
## 8 0.3 6 0 1 1 1 500
## 12 0.5 6 0 1 1 1 500
## 16 0.7 6 0 1 1 1 500
## ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.9587113 0.9282110 0.8660077 0.0018139963 0.006936967 0.006714591
## 5 0.9602232 0.9256522 0.8735910 0.0011007549 0.008740264 0.003367419
## 9 0.9575008 0.9206034 0.8723390 0.0016350743 0.009509261 0.005291526
## 13 0.9537088 0.9119584 0.8702517 0.0018441841 0.007904567 0.006128714
## 3 0.9603112 0.9262747 0.8747039 0.0013420697 0.008548211 0.006593165
## 7 0.9558581 0.9178368 0.8739388 0.0020919100 0.007824727 0.004246587
## 11 0.9544239 0.9136180 0.8726171 0.0023557790 0.009677524 0.004608908
## 15 0.9511078 0.9114738 0.8694865 0.0022248176 0.007256943 0.004837835
## 2 0.9602386 0.9295253 0.8691383 0.0014284359 0.007079300 0.005460137
## 6 0.9590421 0.9221939 0.8737301 0.0008528056 0.007664142 0.004206917
## 10 0.9554188 0.9163155 0.8720605 0.0021965151 0.008240287 0.006674031
## 14 0.9517372 0.9113356 0.8705302 0.0016828370 0.005059644 0.005979794
## 4 0.9596356 0.9246838 0.8759562 0.0014189847 0.009058871 0.006341201
## 8 0.9551759 0.9170760 0.8733125 0.0022515964 0.008599235 0.006052744
## 12 0.9542882 0.9141714 0.8724083 0.0020691886 0.010539965 0.004771330
## 16 0.9519184 0.9116817 0.8710865 0.0018127377 0.009592941 0.006018068
xg_tuned2$finalModel
## ##### xgb.Booster
## raw: 809.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, objective = "binary:logistic")
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0", colsample_bytree = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## # of features: 44
## niter: 300
## nfeatures : 44
## xNames : Age Occupationblue-collar Occupationentrepreneur Occupationhousemaid Occupationmanagement Occupationretired Occupationself-employed Occupationservices Occupationstudent Occupationtechnician Occupationunemployed Occupationunknown Marital_Statusmarried Marital_Statussingle Marital_Statusunknown Contact_Typetelephone Monthaug Monthdec Monthjul Monthjun Monthmar Monthmay Monthnov Monthoct Monthsep Daymon Daythu Daytue Daywed Contacts_This_Campaign Days_Since_Last_Campaign Contacts_Before_This_Campaign Previous_Outcomenonexistent Previous_Outcomesuccess cpi cci euribor3m Education2less.than.hs Education2professional.course Education2university.degree Education2unknown Loan_ProfileHousing Loan - No Default Loan_ProfileNo Loans - No Default Loan_ProfilePersonal Loan - No Default
## problemType : Classification
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 3 300 6 0.1 0 1 1 1
## obsLevels : no yes
## param :
## list()
# Test model
predictions_xg_tuned2 <- predict(xg_tuned2,
newdata = df_bal_test_ada,
type = "raw")
cm_xg_tuned2 <- confusionMatrix(predictions_xg_tuned2,
df_bal_test_ada$Y)
probabilities_xg_tuned2 <- predict(xg_tuned2,
newdata = df_bal_test_ada,
type = "prob")[, 2]
auc_xg_tuned2 <- auc(roc(df_bal_test_ada$Y,
probabilities_xg_tuned2))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Add metrics to the matrix
matrix_metrics <- rbind(matrix_metrics,
c("7 xgBoost Tuned 2",
cm_xg_tuned2$overall["Accuracy"],
cm_xg_tuned2$byClass["Precision"],
cm_xg_tuned2$byClass["Recall"],
cm_xg_tuned2$byClass["F1"],
auc_xg_tuned2))
matrix_metrics
## Model Accuracy Precision
## [1,] "1 Tree Unbalanced Y" "0.897782453868566" "0.902831990698447"
## [2,] "2 Tree Balanced Y" "0.854229057061918" "0.833105957253297"
## [3,] "3 Random Forest Default" "0.897288547146904" "0.892464553130476"
## [4,] "4 Random Forest Tuned" "0.897369486038041" "0.886813186813187"
## [5,] "5 Adaboost Tuned" "0.891614052128865" "0.873423561980929"
## [6,] "6 xgBoost Tuned" "0.894123360854784" "0.874942458186282"
## [7,] "7 xgBoost Tuned 2" "0.896308887809616" "0.875688073394495"
## Recall F1 AUC
## [1,] "0.991517694272163" "0.945098891545316" "0.682583832354162"
## [2,] "0.887023886378309" "0.859219885875088" "0.893630301738175"
## [3,] "0.904131697869593" "0.898260242122985" "0.960051218563459"
## [4,] "0.91171723692705" "0.899092790068439" "0.959815743565893"
## [5,] "0.916707021791768" "0.89454201779948" "0.956248177663717"
## [6,] "0.920419693301049" "0.897105097545626" "0.958505522407873"
## [7,] "0.924455205811138" "0.899411071849234" "0.960484397787394"
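Because rbind() on a character vector coerces every column to character, the final matrix stores the metrics as strings. A minimal sketch (not part of the original run) that converts it to a tibble with numeric metric columns for easier comparison or plotting:
# Optional: convert the metrics matrix to a tibble with numeric metric columns
df_metrics <- as_tibble(matrix_metrics) %>%
  mutate(across(-Model, as.numeric))
df_metrics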
#---------------------------------
# parallel processing stop (chatgpt)
#---------------------------------
stopCluster(cl)
registerDoSEQ()