library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(rpart)
library(ROCR)
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
raw_data <- read.csv("https://raw.githubusercontent.com/suswong/DATA-622/refs/heads/main/bank-full.csv",sep=";")
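Before any preprocessing, it is worth confirming the semicolon-separated file parsed correctly; the UCI bank-full.csv file should yield 45,211 rows and 17 columns, with the target y as the last column (a quick sketch, output omitted here):
# Structural sanity check on the raw import (expected values assume the UCI file)
dim(raw_data)  # expect 45211 rows, 17 columns
str(raw_data)  # mix of integer and character columns; target is y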
The following preprocessing was done:
# Convert character variables (including the target y) to factors
bank_df <- raw_data %>%
mutate(across(where(is.character), as.factor))
# Encode categorical variables
X_categorical <- bank_df %>%
select_if(is.factor) %>%
select(-y)
X_numerical <- bank_df %>%
select_if(is.numeric)
Y <- bank_df$y
dummy <- dummyVars(" ~ .", data = X_categorical, fullRank = TRUE)
encoded_categ <- predict(dummy, newdata = X_categorical)
encoded_df <- cbind(X_numerical, as.data.frame(encoded_categ), y=Y )
colnames(encoded_df) <- make.names(colnames(encoded_df))
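# Note: fullRank = TRUE drops one reference level per factor, so a k-level
# factor becomes k-1 indicator columns. As an illustrative check (not part of
# the original run), marital (divorced/married/single) should keep only the
# married and single dummies, with divorced as the baseline:
grep("^marital", colnames(encoded_df), value = TRUE)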
set.seed(1)
trainIndex <- createDataPartition(encoded_df$y, p = 0.7, list = FALSE)
train_data <- encoded_df[trainIndex, ]
test_data <- encoded_df[-trainIndex, ]
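Because createDataPartition() samples within each level of y, the 70/30 split should preserve the roughly 88/12 no/yes ratio of the full data. A quick sanity check (output omitted):
# Confirm the stratified split kept the class balance intact
prop.table(table(encoded_df$y))
prop.table(table(train_data$y))
prop.table(table(test_data$y))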
Objective: Create a simple decision tree with default settings, without any tuning, to establish baseline performance.
control_tree <- rpart(y ~ ., data = train_data, method = "class")
pred_control_prob <- predict(control_tree, test_data, type = "prob")
pred_control <- factor(ifelse(pred_control_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_control <- confusionMatrix(pred_control, test_data$y)
roc_control <- roc(ifelse(test_data$y == "yes", 1, 0), pred_control_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_control <- auc(roc_control)
print(conf_matrax_control)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11658 1051
## yes 318 535
##
## Accuracy : 0.8991
## 95% CI : (0.8939, 0.9041)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.783e-09
##
## Kappa : 0.3887
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9734
## Specificity : 0.3373
## Pos Pred Value : 0.9173
## Neg Pred Value : 0.6272
## Prevalence : 0.8831
## Detection Rate : 0.8596
## Detection Prevalence : 0.9371
## Balanced Accuracy : 0.6554
##
## 'Positive' Class : no
##
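The AUC stored in auc_control can also be inspected visually; a minimal pROC sketch (figure not reproduced in this report):
# Plot the baseline tree's ROC curve with the AUC printed on it
plot(roc_control, print.auc = TRUE, main = "Baseline decision tree ROC")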
Objective: Investigate whether limiting the tree depth can improve the model’s generalization and prevent overfitting.
dt_2 <- rpart(y ~ ., data = train_data, method = "class", control = rpart.control(maxdepth = 2))
pred_dt_2_prob <- predict(dt_2, test_data, type = "prob")
pred_dt_2 <- factor(ifelse(pred_dt_2_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_dt_2 <- confusionMatrix(pred_dt_2, test_data$y)
roc_dt_2 <- roc(ifelse(test_data$y == "yes", 1, 0), pred_dt_2_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_dt_2 <- auc(roc_dt_2)
print(conf_matrax_dt_2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11604 1057
## yes 372 529
##
## Accuracy : 0.8946
## 95% CI : (0.8893, 0.8998)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.114e-05
##
## Kappa : 0.3722
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9689
## Specificity : 0.3335
## Pos Pred Value : 0.9165
## Neg Pred Value : 0.5871
## Prevalence : 0.8831
## Detection Rate : 0.8556
## Detection Prevalence : 0.9336
## Balanced Accuracy : 0.6512
##
## 'Positive' Class : no
##
dt_3 <- rpart(y ~ ., data = train_data, method = "class", control = rpart.control(maxdepth = 5))
pred_dt_3_prob <- predict(dt_3, test_data, type = "prob")
pred_dt_3 <- factor(ifelse(pred_dt_3_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_dt_3 <- confusionMatrix(pred_dt_3, test_data$y)
roc_dt_3 <- roc(ifelse(test_data$y == "yes", 1, 0), pred_dt_3_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_dt_3 <- auc(roc_dt_3)
print(conf_matrax_dt_3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11658 1051
## yes 318 535
##
## Accuracy : 0.8991
## 95% CI : (0.8939, 0.9041)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.783e-09
##
## Kappa : 0.3887
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9734
## Specificity : 0.3373
## Pos Pred Value : 0.9173
## Neg Pred Value : 0.6272
## Prevalence : 0.8831
## Detection Rate : 0.8596
## Detection Prevalence : 0.9371
## Balanced Accuracy : 0.6554
##
## 'Positive' Class : no
##
Objective: Use the cp value that minimizes cross-validation error to see whether it improves the model’s generalization and prevents overfitting.
The cp plot shows that as the size of the tree increases, the cross-validated relative error initially decreases sharply and then levels off. The high error at the largest cp value indicates underfitting: a single-node tree is too simple to capture any underlying pattern. There is no evidence of overfitting, as the cross-validation error does not rise again as cp decreases. The optimal cp is 0.013 with a tree size of 6, as it has the lowest cross-validated relative error.
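Rather than reading the value off the plot, the same choice can be made programmatically from the cp table stored in the fitted tree; this sketch should recover a value close to the 0.013 used below:
# Pick the cp with the lowest cross-validated error from rpart's cp table
cp_table <- control_tree$cptable
cp_from_table <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
cp_from_table
A common variant is the one-standard-error rule, which selects the simplest tree whose xerror is within one xstd of the minimum.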
plotcp(control_tree)
optimal_cp <- 0.013
# Prune the tree
pruned_tree <- prune(control_tree, cp = optimal_cp)
pred_pruned_tree_prob <- predict(pruned_tree, test_data, type = "prob")
pred_pruned_tree <- factor(ifelse(pred_pruned_tree_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_pruned_tree <- confusionMatrix(pred_pruned_tree, test_data$y)
roc_pruned <- roc(ifelse(test_data$y == "yes", 1, 0), pred_pruned_tree_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_pruned <- auc(roc_pruned)
print(conf_matrax_pruned_tree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11658 1051
## yes 318 535
##
## Accuracy : 0.8991
## 95% CI : (0.8939, 0.9041)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.783e-09
##
## Kappa : 0.3887
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9734
## Specificity : 0.3373
## Pos Pred Value : 0.9173
## Neg Pred Value : 0.6272
## Prevalence : 0.8831
## Detection Rate : 0.8596
## Detection Prevalence : 0.9371
## Balanced Accuracy : 0.6554
##
## 'Positive' Class : no
##
# Plot the pruned tree
# rpart.plot(pruned_tree) # requires the rpart.plot package
The first model was a simple decision tree with default settings. It achieved an accuracy of 0.899, precision of 0.917, F1-score of 0.944, and AUC of 0.747. The next experiment adjusted the maximum depth of the tree; the idea behind limiting depth is to reduce overfitting and improve generalization. However, capping the depth at 2 dropped accuracy slightly to 0.895, while raising the cap to 5 reproduced the default model’s performance exactly. Finally, I pruned the tree at the optimal complexity parameter (cp) identified from the cp plot. The pruned model performed the same as the default one, indicating that pruning did not greatly affect performance.
summary_comparison <- data.frame(
Model = c("Decision Tree (default)", "Decision Tree (Max-depth = 2)", "Decision Tree (Max-depth = 5)", "Decision Tree (Pruned)"),
Accuracy = c(conf_matrax_control$overall["Accuracy"],
conf_matrax_dt_2$overall["Accuracy"],
conf_matrax_dt_3$overall["Accuracy"],
conf_matrax_pruned_tree$overall["Accuracy"]),
Precision = c(conf_matrax_control$byClass["Pos Pred Value"],
conf_matrax_dt_2$byClass["Pos Pred Value"],
conf_matrax_dt_3$byClass["Pos Pred Value"],
conf_matrax_pruned_tree$byClass["Pos Pred Value"]),
F1_Score = c(conf_matrax_control$byClass["F1"],
conf_matrax_dt_2$byClass["F1"],
conf_matrax_dt_3$byClass["F1"],
conf_matrax_pruned_tree$byClass["F1"]),
AUC = c(auc_control,
auc_dt_2,
auc_dt_3,
auc_pruned),
Notes = c("Baseline performance", "Slight drop due to restricted depth", "Same as default", "Simplified but similar performance to default")
)
print(summary_comparison)
## Model Accuracy Precision F1_Score AUC
## 1 Decision Tree (default) 0.8990562 0.9173027 0.9445412 0.7465744
## 2 Decision Tree (Max-depth = 2) 0.8946321 0.9165153 0.9419978 0.7445256
## 3 Decision Tree (Max-depth = 5) 0.8990562 0.9173027 0.9445412 0.7465744
## 4 Decision Tree (Pruned) 0.8990562 0.9173027 0.9445412 0.7465744
## Notes
## 1 Baseline performance
## 2 Slight drop due to restricted depth
## 3 Same as default
## 4 Simplified but similar performance to default
Objective: Train a Random Forest with default settings (ntree = 100) as a baseline ensemble model.
rf_control <- randomForest(y ~ ., data = train_data, ntree = 100)
pred_rf_control_prob <- predict(rf_control, test_data, type = "prob")
pred_rf_control <- factor(ifelse(pred_rf_control_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_rf_control <- confusionMatrix(pred_rf_control, test_data$y)
roc_rf_control <- roc(ifelse(test_data$y == "yes", 1, 0), pred_rf_control_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_rf_control <- auc(roc_rf_control)
print(conf_matrax_rf_control)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11690 1007
## yes 286 579
##
## Accuracy : 0.9047
## 95% CI : (0.8996, 0.9096)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 4.508e-16
##
## Kappa : 0.425
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9761
## Specificity : 0.3651
## Pos Pred Value : 0.9207
## Neg Pred Value : 0.6694
## Prevalence : 0.8831
## Detection Rate : 0.8620
## Detection Prevalence : 0.9362
## Balanced Accuracy : 0.6706
##
## 'Positive' Class : no
##
The top predictors in this model include duration, age, balance, day, poutcome.success, pdays, campaign, and housing.yes. These features strongly influence whether someone subscribes to a term deposit.
importance(rf_control)
## MeanDecreaseGini
## age 532.50727
## balance 502.45423
## day 450.54607
## duration 1591.92806
## campaign 194.43251
## pdays 265.90243
## previous 129.65296
## job.blue.collar 50.77950
## job.entrepreneur 20.31852
## job.housemaid 19.30646
## job.management 59.32040
## job.retired 30.20458
## job.self.employed 24.90566
## job.services 36.06417
## job.student 28.50754
## job.technician 59.73746
## job.unemployed 25.69614
## job.unknown 7.42874
## marital.married 67.80841
## marital.single 51.89833
## education.secondary 67.61013
## education.tertiary 66.53326
## education.unknown 31.33232
## default.yes 10.07857
## housing.yes 139.96472
## loan.yes 51.17866
## contact.telephone 39.99717
## contact.unknown 83.70437
## month.aug 61.09161
## month.dec 24.97938
## month.feb 51.16335
## month.jan 32.57420
## month.jul 56.87270
## month.jun 64.92817
## month.mar 85.55632
## month.may 58.83251
## month.nov 51.65899
## month.oct 73.49209
## month.sep 55.73884
## poutcome.other 24.02855
## poutcome.success 328.60444
## poutcome.unknown 50.90501
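The raw importance table is easier to read sorted; randomForest’s built-in plot does this directly (figure not shown):
# Plot the ten predictors with the largest mean decrease in Gini impurity
varImpPlot(rf_control, n.var = 10, main = "Top predictors, rf_control")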
Objective: Investigate whether increasing the number of trees leads to better performance.
rf_2 <- randomForest(y ~ ., data = train_data, ntree = 200)
pred_rf_2_prob <- predict(rf_2, test_data, type = "prob")
pred_rf_2 <- factor(ifelse(pred_rf_2_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_rf_2 <- confusionMatrix(pred_rf_2, test_data$y)
roc_rf_2 <- roc(ifelse(test_data$y == "yes", 1, 0), pred_rf_2_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_rf_2 <- auc(roc_rf_2)
print(conf_matrax_rf_2)
Objective: Investigate whether optimizing the number of features considered at each split (mtry) improves model performance.
tune_rf <- tuneRF(train_data[,-ncol(train_data)], train_data$y,
stepFactor=1.5)
## mtry = 6 OOB error = 9.68%
## Searching left ...
## mtry = 4 OOB error = 9.83%
## -0.01599739 0.05
## Searching right ...
## mtry = 9 OOB error = 9.69%
## -0.001305909 0.05
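tuneRF() returns a matrix of the mtry values tried and their OOB errors, so the best value can be extracted programmatically instead of hard-coding 6 below (column names assumed from the randomForest documentation):
# Select the mtry with the lowest out-of-bag error (6 in the run above)
best_mtry <- tune_rf[which.min(tune_rf[, "OOBError"]), "mtry"]
best_mtry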
rf_tuned <- randomForest(y ~ ., data = train_data, ntree = 100, mtry = 6)
pred_rf_tuned_prob <- predict(rf_tuned, test_data, type = "prob")
pred_rf_tuned <- factor(ifelse(pred_rf_tuned_prob[, "yes"] > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_rf_tuned <- confusionMatrix(pred_rf_tuned, test_data$y)
roc_rf_tuned <- roc(ifelse(test_data$y == "yes", 1, 0), pred_rf_tuned_prob[, "yes"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_rf_tuned <- auc(roc_rf_tuned)
print(conf_matrax_rf_tuned)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11703 1016
## yes 273 570
##
## Accuracy : 0.905
## 95% CI : (0.8999, 0.9098)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4224
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9772
## Specificity : 0.3594
## Pos Pred Value : 0.9201
## Neg Pred Value : 0.6762
## Prevalence : 0.8831
## Detection Rate : 0.8629
## Detection Prevalence : 0.9378
## Balanced Accuracy : 0.6683
##
## 'Positive' Class : no
##
The default Random Forest model with 100 trees performed better than the previous models, with an accuracy of 0.905, precision of 0.921, F1-score of 0.948, and AUC of 0.926. Increasing the number of trees to 200 yielded only marginal gains in accuracy and AUC. Tuning mtry to 6, the value with the lowest OOB error, produced performance essentially identical to the default model, likely because the default mtry for classification (the floor of the square root of p, here 6 with 42 predictors) is the same value. Overall, the Random Forest models performed better than the Decision Tree models.
summary_comparison <- data.frame(
Model = c("Decision Tree (default)", "Decision Tree (Max-depth = 2)", "Decision Tree (Max-depth = 5)", "Decision Tree (Pruned)", "Random Forest (default)", "Random Forest (ntree=200)", "Random Forest (Tuned, mtry=6)"),
Accuracy = c(conf_matrax_control$overall["Accuracy"],
conf_matrax_dt_2$overall["Accuracy"],
conf_matrax_dt_3$overall["Accuracy"],
conf_matrax_pruned_tree$overall["Accuracy"],
conf_matrax_rf_control$overall["Accuracy"],
conf_matrax_rf_2$overall["Accuracy"],
conf_matrax_rf_tuned$overall["Accuracy"]),
Precision = c(conf_matrax_control$byClass["Pos Pred Value"],
conf_matrax_dt_2$byClass["Pos Pred Value"],
conf_matrax_dt_3$byClass["Pos Pred Value"],
conf_matrax_pruned_tree$byClass["Pos Pred Value"],
conf_matrax_rf_control$byClass["Pos Pred Value"],
conf_matrax_rf_2$byClass["Pos Pred Value"],
conf_matrax_rf_tuned$byClass["Pos Pred Value"]),
F1_Score = c(conf_matrax_control$byClass["F1"],
conf_matrax_dt_2$byClass["F1"],
conf_matrax_dt_3$byClass["F1"],
conf_matrax_pruned_tree$byClass["F1"],
conf_matrax_rf_control$byClass["F1"],
conf_matrax_rf_2$byClass["F1"],
conf_matrax_rf_tuned$byClass["F1"]),
AUC = c(auc_control,
auc_dt_2,
auc_dt_3,
auc_pruned,
auc_rf_control,
auc_rf_2,
auc_rf_tuned),
Comments = c("Baseline performance", "Slight drop due to restricted depth", "Same as default", "Simplified but similar performance to default", "", "", "")
)
print(summary_comparison)
## Model Accuracy Precision F1_Score AUC
## 1 Decision Tree (default) 0.8990562 0.9173027 0.9445412 0.7465744
## 2 Decision Tree (Max-depth = 2) 0.8946321 0.9165153 0.9419978 0.7445256
## 3 Decision Tree (Max-depth = 5) 0.8990562 0.9173027 0.9445412 0.7465744
## 4 Decision Tree (Pruned) 0.8990562 0.9173027 0.9445412 0.7465744
## 5 Random Forest (default) 0.9046601 0.9206899 0.9475945 0.9260661
## 6 Random Forest (ntree=200) 0.9057661 0.9211811 0.9482088 0.9280303
## 7 Random Forest (Tuned, mtry=6) 0.9049550 0.9201195 0.9478032 0.9280113
## Comments
## 1 Baseline performance
## 2 Slight drop due to restricted depth
## 3 Same as default
## 4 Simplified but similar performance to default
## 5
## 6
## 7
Objective: Investigate whether boosting improves classification accuracy.
ab_control <- boosting(y ~ ., data = train_data, boos = TRUE, mfinal = 50)
pred_ab_control <- predict(ab_control, test_data)
pred_ab_control_prob <- pred_ab_control$prob[, 2] # column 2 = P(class "yes"), following factor level order
pred_ab_control_class <- factor(ifelse(pred_ab_control_prob > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_ab_control <- confusionMatrix(pred_ab_control_class, test_data$y)
roc_ab_control <- roc(ifelse(test_data$y == "yes", 1, 0), pred_ab_control_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_ab_control <- auc(roc_ab_control)
print(conf_matrax_ab_control)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11549 866
## yes 427 720
##
## Accuracy : 0.9047
## 95% CI : (0.8996, 0.9096)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 4.508e-16
##
## Kappa : 0.4754
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9643
## Specificity : 0.4540
## Pos Pred Value : 0.9302
## Neg Pred Value : 0.6277
## Prevalence : 0.8831
## Detection Rate : 0.8516
## Detection Prevalence : 0.9154
## Balanced Accuracy : 0.7092
##
## 'Positive' Class : no
##
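adabag also tracks how much each variable contributes across the boosted trees via the fitted object’s importance component; a quick look at the top features (output not shown):
# Ten most influential variables in the boosted ensemble
sort(ab_control$importance, decreasing = TRUE)[1:10]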
Objective: Investigate whether adjusting the hyperparameters (more boosting iterations and deeper base trees) improves the model’s generalization and prevents overfitting.
ab_2 <- boosting(y ~ ., data = train_data, boos = TRUE, mfinal = 100, control = rpart.control(cp=0.001))
pred_ab_2 <- predict(ab_2, test_data)
pred_ab_2_prob <- pred_ab_2$prob[,2]
pred_ab_2_class <- factor(ifelse(pred_ab_2_prob > 0.5, "yes", "no"), levels = levels(test_data$y))
conf_matrax_ab_2 <- confusionMatrix(pred_ab_2_class, test_data$y)
roc_ab_2 <- roc(ifelse(test_data$y == "yes", 1, 0), pred_ab_2_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_ab_2 <- auc(roc_ab_2)
print(conf_matrax_ab_2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 11491 829
## yes 485 757
##
## Accuracy : 0.9031
## 95% CI : (0.898, 0.908)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 4.822e-14
##
## Kappa : 0.4822
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9595
## Specificity : 0.4773
## Pos Pred Value : 0.9327
## Neg Pred Value : 0.6095
## Prevalence : 0.8831
## Detection Rate : 0.8473
## Detection Prevalence : 0.9084
## Balanced Accuracy : 0.7184
##
## 'Positive' Class : no
##
The default AdaBoost model also performed better than the Decision Tree models, with an accuracy of 0.905, precision of 0.930, F1-score of 0.947, and AUC of 0.927. The model with adjusted hyperparameters showed a slight drop in accuracy and AUC, but precision remained high. Overall, the AdaBoost models also performed better than the Decision Tree models.
summary_comparison <- data.frame(
Model = c("Decision Tree (default)", "Decision Tree (Max-depth = 2)", "Decision Tree (Max-depth = 5)", "Decision Tree (Pruned)", "Random Forest (default)", "Random Forest (ntree=200)", "Random Forest (Tuned, mtry=6)", "AdaBoost (Default)", "AdaBoost (mfinal=100, cp=0.001)"),
Accuracy = c(conf_matrax_control$overall["Accuracy"],
conf_matrax_dt_2$overall["Accuracy"],
conf_matrax_dt_3$overall["Accuracy"],
conf_matrax_pruned_tree$overall["Accuracy"],
conf_matrax_rf_control$overall["Accuracy"],
conf_matrax_rf_2$overall["Accuracy"],
conf_matrax_rf_tuned$overall["Accuracy"],
conf_matrax_ab_control$overall["Accuracy"],
conf_matrax_ab_2$overall["Accuracy"]),
Precision = c(conf_matrax_control$byClass["Pos Pred Value"],
conf_matrax_dt_2$byClass["Pos Pred Value"],
conf_matrax_dt_3$byClass["Pos Pred Value"],
conf_matrax_pruned_tree$byClass["Pos Pred Value"],
conf_matrax_rf_control$byClass["Pos Pred Value"],
conf_matrax_rf_2$byClass["Pos Pred Value"],
conf_matrax_rf_tuned$byClass["Pos Pred Value"],
conf_matrax_ab_control$byClass["Pos Pred Value"],
conf_matrax_ab_2$byClass["Pos Pred Value"]),
F1_Score = c(conf_matrax_control$byClass["F1"],
conf_matrax_dt_2$byClass["F1"],
conf_matrax_dt_3$byClass["F1"],
conf_matrax_pruned_tree$byClass["F1"],
conf_matrax_rf_control$byClass["F1"],
conf_matrax_rf_2$byClass["F1"],
conf_matrax_rf_tuned$byClass["F1"],
conf_matrax_ab_control$byClass["F1"],
conf_matrax_ab_2$byClass["F1"]),
AUC = c(auc_control,
auc_dt_2,
auc_dt_3,
auc_pruned,
auc_rf_control,
auc_rf_2,
auc_rf_tuned,
auc_ab_control,
auc_ab_2),
Comments = c("Baseline performance", "Slight drop due to restricted depth", "Same as default", "Simplified but similar performance to default", "Ensemble baseline; large AUC gain over trees", "Marginal gains; highest F1-score and AUC", "Similar to default", "High-precision baseline", "Lower AUC but slightly higher precision")
)
print(summary_comparison)
## Model Accuracy Precision F1_Score AUC
## 1 Decision Tree (default) 0.8990562 0.9173027 0.9445412 0.7465744
## 2 Decision Tree (Max-depth = 2) 0.8946321 0.9165153 0.9419978 0.7445256
## 3 Decision Tree (Max-depth = 5) 0.8990562 0.9173027 0.9445412 0.7465744
## 4 Decision Tree (Pruned) 0.8990562 0.9173027 0.9445412 0.7465744
## 5 Random Forest (default) 0.9046601 0.9206899 0.9475945 0.9260661
## 6 Random Forest (ntree=200) 0.9057661 0.9211811 0.9482088 0.9280303
## 7 Random Forest (Tuned, mtry=6) 0.9049550 0.9201195 0.9478032 0.9280113
## 8 AdaBoost (Default) 0.9046601 0.9302457 0.9469886 0.9268785
## 9 AdaBoost (mfinal=100, cp=0.001) 0.9031116 0.9327110 0.9459170 0.9201388
## Comments
## 1 Baseline performance
## 2 Slight drop due to restricted depth
## 3 Same as default
## 4 Simplified but similar performance to default
## 5 Ensemble baseline; large AUC gain over trees
## 6 Marginal gains; highest F1-score and AUC
## 7 Similar to default
## 8 High-precision baseline
## 9 Lower AUC but slightly higher precision
Comparing all models, the Random Forest and AdaBoost models consistently outperform the Decision Tree models. The Random Forest with 200 trees has the highest AUC and F1-score, although the default 100-tree forest is nearly indistinguishable, while AdaBoost delivers the highest precision.
One limitation of this analysis is the use of a single train-test split to evaluate model performance, which makes the results sensitive to how the data happened to be divided. Cross-validation would give a more reliable estimate of performance by averaging over multiple data splits.
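As a sketch of that enhancement, caret can wrap the random forest in k-fold cross-validation; the object names cv_ctrl and rf_cv are illustrative, not from the analysis above:
# 5-fold cross-validation scored on ROC AUC rather than accuracy
cv_ctrl <- trainControl(method = "cv", number = 5,
classProbs = TRUE, summaryFunction = twoClassSummary)
rf_cv <- train(y ~ ., data = encoded_df, method = "rf",
metric = "ROC", ntree = 100, trControl = cv_ctrl)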
Another enhancement would be to address the class imbalance in the dataset: only about 12% of clients subscribed, and every model’s specificity on the minority "yes" class is low.
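A minimal sketch of one option, stratified downsampling within each random forest bootstrap via the sampsize argument (rf_balanced is an illustrative name):
# Draw equal numbers of "no" and "yes" cases for every tree
n_yes <- sum(train_data$y == "yes")
rf_balanced <- randomForest(y ~ ., data = train_data, ntree = 100,
sampsize = c(n_yes, n_yes))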
Reference: Encoding Categorical Variables, https://amunategui.github.io/dummyVar-Walkthrough/