library(tidyverse)
library(caret)
library(randomForest)
library(rpart)
library(adabag)
library(pROC)
library(ada)
# Seed the RNG so the partitions and model fits below are repeatable.
set.seed(02180)
# Read the semicolon-delimited bank marketing file into a data frame.
# NOTE(review): this is an absolute, machine-specific path.
bank <- read.csv(
  file = "C:\\Users\\jashb\\OneDrive\\Documents\\Masters Data Science\\Spring 2025\\DATA 622\\Assignment 1\\DATA\\bank-additional\\bank-additional-full.csv",
  sep = ";"
)
# Keep only the rows in which no column holds the placeholder "unknown".
# if_all() is the supported replacement for across() inside filter(), which
# dplyr has deprecated since 1.0.8; the predicate must hold for every column.
bank <- bank %>%
  filter(if_all(everything(), ~ . != "unknown"))
Training and testing data are split 75%/25% for the first (default) experiment.
# Draw the row indices for a 75% training partition based on the response y.
# list = FALSE (spelled out, since F is an ordinary reassignable variable)
# returns the indices as a matrix instead of a list.
trainPart <- createDataPartition(bank$y, p = 0.75, list = FALSE)
# Rows in the partition form the training set; the rest form the test set.
trainDat <- bank[trainPart, ]
testDat <- bank[-trainPart, ]
# Convert the response to a factor in both sets for the classifiers below.
trainDat$y <- as.factor(trainDat$y)
testDat$y <- as.factor(testDat$y)
Limit tree complexity to a max depth of three. I think that an unscaled tree with a maximum depth of three will perform well.
# Experiment 1: classification tree on the 75% split, capped at depth 3.
tree_model1 <- rpart(
  y ~ .,
  data = trainDat,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
# Predict class labels for the held-out rows.
tree_pred1 <- predict(tree_model1, newdata = testDat, type = "class")
# Compare the predictions with the true labels and show the summary.
ex1_cm <- confusionMatrix(data = tree_pred1, reference = testDat$y)
print(ex1_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6260 360
## yes 397 604
##
## Accuracy : 0.9007
## 95% CI : (0.8937, 0.9073)
## No Information Rate : 0.8735
## P-Value [Acc > NIR] : 9.742e-14
##
## Kappa : 0.5578
##
## Mcnemar's Test P-Value : 0.1907
##
## Sensitivity : 0.9404
## Specificity : 0.6266
## Pos Pred Value : 0.9456
## Neg Pred Value : 0.6034
## Prevalence : 0.8735
## Detection Rate : 0.8214
## Detection Prevalence : 0.8687
## Balanced Accuracy : 0.7835
##
## 'Positive' Class : no
##
# Report the overall test-set accuracy for experiment 1.
cat("Accuracy of Experiment 1:", ex1_cm$overall[["Accuracy"]], "\n")
## Accuracy of Experiment 1: 0.9006692
Change the training/testing split to 85%/15% and run the tree again with a max depth of 3.
# Draw the row indices for an 85% training partition based on the response y.
# list = FALSE (spelled out, since F is an ordinary reassignable variable)
# returns the indices as a matrix instead of a list.
trainPart2 <- createDataPartition(bank$y, p = 0.85, list = FALSE)
# Rows in the partition form the training set; the rest form the test set.
trainDat2 <- bank[trainPart2, ]
testDat2 <- bank[-trainPart2, ]
# Convert the response to a factor in both sets for the classifier below.
trainDat2$y <- as.factor(trainDat2$y)
testDat2$y <- as.factor(testDat2$y)
# Experiment 2: same depth-3 classification tree, fitted on the 85% split.
tree_model2 <- rpart(
  y ~ .,
  data = trainDat2,
  method = "class",
  control = rpart.control(maxdepth = 3)
)
# Predict class labels for the held-out rows.
tree_pred2 <- predict(tree_model2, newdata = testDat2, type = "class")
# Compare the predictions with the true labels and show the summary.
ex2_cm <- confusionMatrix(data = tree_pred2, reference = testDat2$y)
print(ex2_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 3784 238
## yes 210 340
##
## Accuracy : 0.902
## 95% CI : (0.893, 0.9105)
## No Information Rate : 0.8736
## P-Value [Acc > NIR] : 1.245e-09
##
## Kappa : 0.547
##
## Mcnemar's Test P-Value : 0.2021
##
## Sensitivity : 0.9474
## Specificity : 0.5882
## Pos Pred Value : 0.9408
## Neg Pred Value : 0.6182
## Prevalence : 0.8736
## Detection Rate : 0.8276
## Detection Prevalence : 0.8797
## Balanced Accuracy : 0.7678
##
## 'Positive' Class : no
##
# Report the overall test-set accuracy for experiment 2.
cat("Accuracy of Experiment 2:", ex2_cm$overall[["Accuracy"]], "\n")
## Accuracy of Experiment 2: 0.9020122
Analysis: Holding maxdepth constant at 3 and only changing the training share from 75% to 85% did not change any of the metrics by a noticeable amount. Both experiments show that the decision tree model has around 90% accuracy against the test data (set at 25% and 15% respectively). Kappa values for each experiment remained nearly the same (0.56 and 0.55), indicating middle-of-the-pack agreement for both models; with Kappa hovering around 55%, one can assume that it's only moderately better to use the model as opposed to guessing.
Standard Random Forest: the standard random forest is likely to perform well.
# Experiment 3: random forest on the 75% split with the package defaults.
rfor_mod1 <- randomForest(y ~ ., data = trainDat)
# Predict class labels for the held-out rows.
pred_rfmod1 <- predict(rfor_mod1, newdata = testDat)
# Compare the predictions with the true labels and show the summary.
rf_cm1 <- confusionMatrix(data = pred_rfmod1, reference = testDat$y)
print(rf_cm1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6447 508
## yes 210 456
##
## Accuracy : 0.9058
## 95% CI : (0.899, 0.9123)
## No Information Rate : 0.8735
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5087
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9685
## Specificity : 0.4730
## Pos Pred Value : 0.9270
## Neg Pred Value : 0.6847
## Prevalence : 0.8735
## Detection Rate : 0.8460
## Detection Prevalence : 0.9126
## Balanced Accuracy : 0.7207
##
## 'Positive' Class : no
##
# Report the overall test-set accuracy for the default random forest.
cat("Accuracy of Experiment 3(RF):", rf_cm1$overall[["Accuracy"]], "\n")
## Accuracy of Experiment 3(RF): 0.9057866
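The proposed experiment here for Random Forest is to change the number of trees from the default. My hypothesis was that if the forest grows to triple the standard number of trees, then the accuracy should see a jump. Note, however, that randomForest() defaults to ntree = 500 (not 100), so the ntree = 300 used below actually grows fewer trees than the standard model above.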
# Experiment 4: random forest on the 75% split with the tree count set to 300.
# NOTE(review): randomForest() documents a default of ntree = 500, so this
# setting is below the default rather than above it.
rfor_mod2 <- randomForest(y ~ ., data = trainDat, ntree = 300)
# Predict class labels for the held-out rows.
pred_rfmod2 <- predict(rfor_mod2, newdata = testDat)
# Compare the predictions with the true labels and show the summary.
rf_cm2 <- confusionMatrix(data = pred_rfmod2, reference = testDat$y)
print(rf_cm2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 6438 504
## yes 219 460
##
## Accuracy : 0.9051
## 95% CI : (0.8983, 0.9116)
## No Information Rate : 0.8735
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5086
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9671
## Specificity : 0.4772
## Pos Pred Value : 0.9274
## Neg Pred Value : 0.6775
## Prevalence : 0.8735
## Detection Rate : 0.8448
## Detection Prevalence : 0.9109
## Balanced Accuracy : 0.7221
##
## 'Positive' Class : no
##
Analysis: When the number of trees was set to 300 the performance was essentially unchanged (accuracy 0.9051 vs. 0.9058, Kappa 0.5086 vs. 0.5087). The Random Forest seems to do well correctly predicting no, but given that this decision has a monetary piece connected to it, the false yes and false no predictions will be a problem, especially if swept under the rug.
# Re-level the response so "yes" is the first level; the test set reuses the
# training levels so both factors share the same order.
trainDat$y <- factor(trainDat$y, levels = c("yes", "no"))
testDat$y <- factor(testDat$y, levels = levels(trainDat$y))
# Keep only the response and four predictors for the boosting experiments.
trainDat3 <- select(trainDat, y, poutcome, campaign, duration, age)
testDat3 <- select(testDat, y, poutcome, campaign, duration, age)
# Confirm the level order on both sets.
levels(trainDat$y)
## [1] "yes" "no"
levels(testDat$y)
## [1] "yes" "no"
# AdaBoost experiment 1: 50 boosting iterations of depth-1 trees (stumps)
# with shrinkage nu = 0.1, fitted on the reduced training set.
adaboost_model <- ada(
  y ~ .,
  data = trainDat3,
  iter = 50,
  nu = 0.1,
  control = rpart.control(maxdepth = 1)
)
# Predict class labels for the held-out rows.
ada_pred <- predict(adaboost_model, testDat3)
# The predicted factor's levels are not in the same order as the reference
# ("yes", "no"), which made confusionMatrix() warn and re-level internally.
# Aligning the levels explicitly gives the same table without the warning.
ada_cf1 <- confusionMatrix(
  factor(ada_pred, levels = levels(testDat3$y)),
  testDat3$y
)
print(ada_cf1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 33 9
## no 931 6648
##
## Accuracy : 0.8767
## 95% CI : (0.8691, 0.884)
## No Information Rate : 0.8735
## P-Value [Acc > NIR] : 0.2094
##
## Kappa : 0.0556
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.034232
## Specificity : 0.998648
## Pos Pred Value : 0.785714
## Neg Pred Value : 0.877161
## Prevalence : 0.126493
## Detection Rate : 0.004330
## Detection Prevalence : 0.005511
## Balanced Accuracy : 0.516440
##
## 'Positive' Class : yes
##
This experiment is aimed at improving the accuracy of the model. Increasing the depth from 1 to 3 should give more complex trees, while increasing the number of iterations from 50 to 200 should increase predictive power.
# AdaBoost experiment 2: 200 boosting iterations of depth-3 trees with
# shrinkage nu = 0.1, fitted on the reduced training set.
adaboost_model2 <- ada(
  y ~ .,
  data = trainDat3,
  iter = 200,
  nu = 0.1,
  control = rpart.control(maxdepth = 3)
)
# Predict class labels for the held-out rows.
ada_pred2 <- predict(adaboost_model2, testDat3)
# The predicted factor's levels are not in the same order as the reference
# ("yes", "no"), which made confusionMatrix() warn and re-level internally.
# Aligning the levels explicitly gives the same table without the warning.
ada_cf2 <- confusionMatrix(
  factor(ada_pred2, levels = levels(testDat3$y)),
  testDat3$y
)
print(ada_cf2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 330 175
## no 634 6482
##
## Accuracy : 0.8938
## 95% CI : (0.8867, 0.9007)
## No Information Rate : 0.8735
## P-Value [Acc > NIR] : 2.507e-08
##
## Kappa : 0.3968
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.34232
## Specificity : 0.97371
## Pos Pred Value : 0.65347
## Neg Pred Value : 0.91091
## Prevalence : 0.12649
## Detection Rate : 0.04330
## Detection Prevalence : 0.06626
## Balanced Accuracy : 0.65802
##
## 'Positive' Class : yes
##