Libraries

library(tidyverse)
library(caret)
library(randomForest)
library(rpart)
library(adabag)
library(pROC)
library(ada)
set.seed(02180)

Import data

# Load the semicolon-delimited bank marketing data.
# NOTE(review): the path is absolute and machine-specific; the script only runs
# as-is on the original author's machine.
bank <- read.csv("C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 622\\Assignment 1\\DATA\\bank-additional\\bank-additional-full.csv", sep = ';')
# Keep only rows in which no column holds the placeholder value "unknown".
# if_all() replaces the deprecated filter(across(...)) form and keeps the same
# "every column must pass" semantics.
bank <- bank %>%
  filter(if_all(everything(), ~ . != "unknown"))

Experiments

Split the data

Training and testing data are split 75% / 25% for the first (default) experiment.

# 75% / 25% train/test split driven by the outcome column `y`.
# list = FALSE (spelled out; `F` is an ordinary variable that can be reassigned)
# makes createDataPartition() return row indices usable directly in `[`.
trainPart <- createDataPartition(bank$y, p = 0.75, list = FALSE)
trainDat <- bank[trainPart, ]
testDat <- bank[-trainPart, ]
# The classifiers and confusionMatrix() below work on a factor outcome.
trainDat$y <- as.factor(trainDat$y)
testDat$y <- as.factor(testDat$y)

1. Decision Trees

Experiment 1 - Decision Tree

Limit tree complexity to a max depth of three. I think that an unscaled tree with a maximum depth of three will perform well.

# Experiment 1: classification tree on the 75% training split, depth capped at 3.
tree_model1 <- rpart(
  y ~ .,
  data = trainDat,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
# Hard class labels for the held-out rows.
tree_pred1 <- predict(tree_model1, newdata = testDat, type = "class")
# Cross-tabulate predicted vs. actual labels and derive the summary statistics.
ex1_cm <- confusionMatrix(data = tree_pred1, reference = testDat$y)
print(ex1_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6260  360
##        yes  397  604
##                                           
##                Accuracy : 0.9007          
##                  95% CI : (0.8937, 0.9073)
##     No Information Rate : 0.8735          
##     P-Value [Acc > NIR] : 9.742e-14       
##                                           
##                   Kappa : 0.5578          
##                                           
##  Mcnemar's Test P-Value : 0.1907          
##                                           
##             Sensitivity : 0.9404          
##             Specificity : 0.6266          
##          Pos Pred Value : 0.9456          
##          Neg Pred Value : 0.6034          
##              Prevalence : 0.8735          
##          Detection Rate : 0.8214          
##    Detection Prevalence : 0.8687          
##       Balanced Accuracy : 0.7835          
##                                           
##        'Positive' Class : no              
## 
# Report the headline accuracy for Experiment 1 on a single line.
cat("Accuracy of Experiment 1:", ex1_cm$overall[["Accuracy"]], "\n")
## Accuracy of Experiment 1: 0.9006692

Experiment 2 - Decision Tree

Changing the split in the testing and training data, running again on a max depth of 3.

# 85% / 15% train/test split driven by the outcome column `y`.
# list = FALSE (spelled out; `F` is an ordinary variable that can be reassigned)
# makes createDataPartition() return row indices usable directly in `[`.
trainPart2 <- createDataPartition(bank$y, p = 0.85, list = FALSE)
trainDat2 <- bank[trainPart2, ]
testDat2 <- bank[-trainPart2, ]
# The classifier and confusionMatrix() below work on a factor outcome.
trainDat2$y <- as.factor(trainDat2$y)
testDat2$y <- as.factor(testDat2$y)

# Experiment 2: same depth-3 classification tree, trained on the 85% split.
tree_model2 <- rpart(
  y ~ .,
  data = trainDat2,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
# Hard class labels for the held-out rows.
tree_pred2 <- predict(tree_model2, newdata = testDat2, type = "class")
# Cross-tabulate predicted vs. actual labels and derive the summary statistics.
ex2_cm <- confusionMatrix(data = tree_pred2, reference = testDat2$y)
print(ex2_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3784  238
##        yes  210  340
##                                          
##                Accuracy : 0.902          
##                  95% CI : (0.893, 0.9105)
##     No Information Rate : 0.8736         
##     P-Value [Acc > NIR] : 1.245e-09      
##                                          
##                   Kappa : 0.547          
##                                          
##  Mcnemar's Test P-Value : 0.2021         
##                                          
##             Sensitivity : 0.9474         
##             Specificity : 0.5882         
##          Pos Pred Value : 0.9408         
##          Neg Pred Value : 0.6182         
##              Prevalence : 0.8736         
##          Detection Rate : 0.8276         
##    Detection Prevalence : 0.8797         
##       Balanced Accuracy : 0.7678         
##                                          
##        'Positive' Class : no             
## 
# Report the headline accuracy for Experiment 2 on a single line.
cat("Accuracy of Experiment 2:", ex2_cm$overall[["Accuracy"]], "\n")
## Accuracy of Experiment 2: 0.9020122

Analysis: Holding maxdepth constant at 3 and changing only the training share from 75% to 85% did not change any of the metrics by a noticeable amount. Both experiments show that the decision tree model has around 90% accuracy against the test data (25% and 15% of the data, respectively). Kappa remained nearly the same across the two experiments (about 0.56 and 0.55), indicating moderate agreement for both models. However, because accuracy (about 90%) is only a few points above the no-information rate (about 87%), the model is only modestly better than always guessing the majority class "no".

2. Random Forest

Experiment 3:

Standard Random Forest: the standard random forest is likely to perform well.

# Experiment 3: random forest with the package's default settings,
# trained on the 75% split.
rfor_mod1 <- randomForest(y ~ ., data = trainDat)
# Predicted class labels for the held-out rows.
pred_rfmod1 <- predict(rfor_mod1, newdata = testDat)
# Cross-tabulate predicted vs. actual labels and derive the summary statistics.
rf_cm1 <- confusionMatrix(data = pred_rfmod1, reference = testDat$y)
print(rf_cm1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6447  508
##        yes  210  456
##                                          
##                Accuracy : 0.9058         
##                  95% CI : (0.899, 0.9123)
##     No Information Rate : 0.8735         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5087         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9685         
##             Specificity : 0.4730         
##          Pos Pred Value : 0.9270         
##          Neg Pred Value : 0.6847         
##              Prevalence : 0.8735         
##          Detection Rate : 0.8460         
##    Detection Prevalence : 0.9126         
##       Balanced Accuracy : 0.7207         
##                                          
##        'Positive' Class : no             
## 
# Report the headline accuracy for Experiment 3 on a single line.
cat("Accuracy of Experiment 3(RF):", rf_cm1$overall[["Accuracy"]], "\n")
## Accuracy of Experiment 3(RF): 0.9057866

Experiment 4:

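The proposed experiment here for Random Forest is to change the number of trees grown. I proposed that tripling a standard of 100 trees to 300 would produce a jump in accuracy. Note, however, that `randomForest()` defaults to `ntree = 500`, so `ntree = 300` actually grows fewer trees than the default model in Experiment 3.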

# Experiment 4: random forest with the number of trees set explicitly to 300,
# trained on the same 75% split as Experiment 3.
rfor_mod2 <- randomForest(y ~ ., data = trainDat, ntree = 300)
# Predicted class labels for the held-out rows.
pred_rfmod2 <- predict(rfor_mod2, newdata = testDat)
# Cross-tabulate predicted vs. actual labels and derive the summary statistics.
rf_cm2 <- confusionMatrix(data = pred_rfmod2, reference = testDat$y)
print(rf_cm2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6438  504
##        yes  219  460
##                                           
##                Accuracy : 0.9051          
##                  95% CI : (0.8983, 0.9116)
##     No Information Rate : 0.8735          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5086          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9671          
##             Specificity : 0.4772          
##          Pos Pred Value : 0.9274          
##          Neg Pred Value : 0.6775          
##              Prevalence : 0.8735          
##          Detection Rate : 0.8448          
##    Detection Prevalence : 0.9109          
##       Balanced Accuracy : 0.7221          
##                                           
##        'Positive' Class : no              
## 

Analysis: Setting the number of trees to 300 left performance essentially unchanged (accuracy 0.9058 in Experiment 3 versus 0.9051 in Experiment 4; Kappa 0.5087 versus 0.5086). The Random Forest does well at correctly predicting "no", but given that this decision has a monetary piece connected to it, the false "yes" and false "no" predictions will be a problem, especially if swept under the rug.

3. Adaboost

Experiment 5: Pared-down dataset with increased folds

# Reorder the outcome levels so "yes" comes first; the test set reuses the
# training set's level order so the two always agree.
trainDat$y <- factor(trainDat$y, levels = c("yes", "no"))
testDat$y <- factor(testDat$y, levels = levels(trainDat$y))

# Reduced feature set for the boosting experiments: the outcome plus four
# predictors.
trainDat3 <- select(trainDat, y, poutcome, campaign, duration, age)
testDat3 <- select(testDat, y, poutcome, campaign, duration, age)
# Confirm the level order on both splits.
levels(trainDat$y)
## [1] "yes" "no"
levels(testDat$y)
## [1] "yes" "no"
# Experiment 5: AdaBoost with 50 boosting iterations of depth-1 trees (stumps)
# and shrinkage nu = 0.1, trained on the reduced feature set.
adaboost_model <- ada(
  y ~ .,
  data = trainDat3,
  iter = 50,
  nu = 0.1,
  control = rpart.control(maxdepth = 1)
)
ada_pred <- predict(adaboost_model, newdata = testDat3)
# The predicted factor's levels come back in a different order than the
# reference's ("yes", "no"). Align them explicitly instead of relying on
# confusionMatrix() to refactor the data (and warn) on our behalf.
ada_pred <- factor(ada_pred, levels = levels(testDat3$y))
ada_cf1 <- confusionMatrix(data = ada_pred, reference = testDat3$y)
print(ada_cf1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes   33    9
##        no   931 6648
##                                          
##                Accuracy : 0.8767         
##                  95% CI : (0.8691, 0.884)
##     No Information Rate : 0.8735         
##     P-Value [Acc > NIR] : 0.2094         
##                                          
##                   Kappa : 0.0556         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.034232       
##             Specificity : 0.998648       
##          Pos Pred Value : 0.785714       
##          Neg Pred Value : 0.877161       
##              Prevalence : 0.126493       
##          Detection Rate : 0.004330       
##    Detection Prevalence : 0.005511       
##       Balanced Accuracy : 0.516440       
##                                          
##        'Positive' Class : yes            
## 

Experiment 6: Increase boosting iterations and increasing max depth from 1 to 3

This experiment is aimed at improving the accuracy of the model. Increasing the depth from 1 to 3 should give more complex trees, while increasing the number of boosting iterations from the 50 used in Experiment 5 to 200 should increase predictive power.

# Experiment 6: AdaBoost with 200 boosting iterations of depth-3 trees and
# shrinkage nu = 0.1, trained on the same reduced feature set as Experiment 5.
adaboost_model2 <- ada(
  y ~ .,
  data = trainDat3,
  iter = 200,
  nu = 0.1,
  control = rpart.control(maxdepth = 3)
)
ada_pred2 <- predict(adaboost_model2, newdata = testDat3)
# The predicted factor's levels come back in a different order than the
# reference's ("yes", "no"). Align them explicitly instead of relying on
# confusionMatrix() to refactor the data (and warn) on our behalf.
ada_pred2 <- factor(ada_pred2, levels = levels(testDat3$y))
ada_cf2 <- confusionMatrix(data = ada_pred2, reference = testDat3$y)
print(ada_cf2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  330  175
##        no   634 6482
##                                           
##                Accuracy : 0.8938          
##                  95% CI : (0.8867, 0.9007)
##     No Information Rate : 0.8735          
##     P-Value [Acc > NIR] : 2.507e-08       
##                                           
##                   Kappa : 0.3968          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.34232         
##             Specificity : 0.97371         
##          Pos Pred Value : 0.65347         
##          Neg Pred Value : 0.91091         
##              Prevalence : 0.12649         
##          Detection Rate : 0.04330         
##    Detection Prevalence : 0.06626         
##       Balanced Accuracy : 0.65802         
##                                           
##        'Positive' Class : yes             
##