# Load libraries
library(tidyverse)
library(caret)
library(rpart)
library(randomForest)
library(adabag)
library(pROC)
# Load the data
bank <- read.csv("https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/622/bank-full.csv", sep=";")
# Check the structure of the dataset
glimpse(bank)
## Rows: 45,211
## Columns: 17
## $ age <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
# Convert target variable 'y' to factor (binary classification: "yes" or "no")
bank$y <- as.factor(bank$y)
# Split dataset into training (80%) and testing (20%)
set.seed(1225)
train_index <- createDataPartition(bank$y, p=0.8, list=FALSE)
train_data <- bank[train_index, ]
test_data <- bank[-train_index, ]
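Because roughly 88% of clients did not subscribe (the No Information Rate reported in the model outputs below is 0.8831), it is worth confirming that the stratified split preserves this imbalance. A small optional check (the object name split_balance is illustrative):
# Sanity check: class proportions should be ~88% "no" / ~12% "yes" in both splits
split_balance <- rbind(train = prop.table(table(train_data$y)),
                       test  = prop.table(table(test_data$y)))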
# Normalize numerical features
num_cols <- sapply(train_data, is.numeric)
train_data[, num_cols] <- scale(train_data[, num_cols])
test_data[, num_cols] <- scale(test_data[, num_cols])
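Note that scale() standardizes the test set with its own means and standard deviations rather than the ones learned from the training data. The results below use the per-set scaling above; for reference, a leakage-free alternative with caret::preProcess would look like the sketch below (kept commented out so it does not re-transform the already scaled data):
# Alternative (not used for the results below): learn centering/scaling on the
# training set only, then apply the same transformation to both sets
# pre_proc <- preProcess(train_data[, num_cols], method = c("center", "scale"))
# train_data[, num_cols] <- predict(pre_proc, train_data[, num_cols])
# test_data[, num_cols]  <- predict(pre_proc, test_data[, num_cols])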
# Helper to report accuracy, precision, recall, F1, and AUC for a fitted model
evaluate_model <- function(model, test_data, predictions, model_name) {
  # Confusion matrix against the true labels (caret treats "no" as the positive class)
  cm <- confusionMatrix(predictions, test_data$y)
  # ROC/AUC computed from the hard class predictions recoded as 0/1
  roc_obj <- roc(as.numeric(test_data$y) - 1, as.numeric(predictions) - 1)
  cat("\nModel:", model_name, "\n")
  cat("Accuracy:", cm$overall["Accuracy"], "\n")
  cat("Precision:", cm$byClass["Precision"], "\n")
  cat("Recall:", cm$byClass["Recall"], "\n")
  cat("F1 Score:", cm$byClass["F1"], "\n")
  cat("AUC:", roc_obj$auc, "\n")
  return(cm)
}
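One caveat: roc() above is given the hard class predictions, so the reported "AUC" reduces to the balanced accuracy (the outputs below show the two numbers match). A probability-based AUC is sketched here for reference; the function name evaluate_auc_prob is illustrative and is not used for the reported results. It assumes the model supports predict(..., type = "prob"), which rpart and randomForest do (the adabag models instead return probabilities in the $prob component of predict()).
# Sketch (not used below): AUC from predicted class probabilities
evaluate_auc_prob <- function(model, test_data) {
  # probability assigned to the "yes" class for each test observation
  prob_yes <- predict(model, test_data, type = "prob")[, "yes"]
  roc_obj <- roc(test_data$y, prob_yes, levels = c("no", "yes"), direction = "<")
  roc_obj$auc
}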
Hypothesis: Increasing the depth of the tree can enhance the accuracy of the predictions.
Hyperparameters:
- Held constant: the 80/20 train/test split.
- Varied: tree depth (maxdepth, minsplit).
Evaluation metrics: accuracy, precision, recall, F1-score, AUC-ROC.
dt_model1 <- rpart(y ~ ., data=train_data, method="class", control=rpart.control(cp=0.01))
dt_predictions1 <- predict(dt_model1, test_data, type="class")
# Evaluate
evaluate_model(dt_model1, test_data, dt_predictions1, "Decision Tree (cp=0.01)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Model: Decision Tree (cp=0.01)
## Accuracy: 0.9024444
## Precision: 0.9203362
## Recall: 0.9738226
## F1 Score: 0.9463242
## AUC: 0.6685575
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7775 673
## yes 209 384
##
## Accuracy : 0.9024
## 95% CI : (0.8961, 0.9085)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 2.393e-09
##
## Kappa : 0.4164
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9738
## Specificity : 0.3633
## Pos Pred Value : 0.9203
## Neg Pred Value : 0.6476
## Prevalence : 0.8831
## Detection Rate : 0.8600
## Detection Prevalence : 0.9344
## Balanced Accuracy : 0.6686
##
## 'Positive' Class : no
##
dt_model2 <- rpart(y ~ ., data=train_data, method="class", control=rpart.control(maxdepth=10, minsplit=20))
dt_predictions2 <- predict(dt_model2, test_data, type="class")
# Evaluate
evaluate_model(dt_model2, test_data, dt_predictions2, "Decision Tree (maxdepth=10, minsplit=20)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Model: Decision Tree (maxdepth=10, minsplit=20)
## Accuracy: 0.9024444
## Precision: 0.9203362
## Recall: 0.9738226
## F1 Score: 0.9463242
## AUC: 0.6685575
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7775 673
## yes 209 384
##
## Accuracy : 0.9024
## 95% CI : (0.8961, 0.9085)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 2.393e-09
##
## Kappa : 0.4164
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9738
## Specificity : 0.3633
## Pos Pred Value : 0.9203
## Neg Pred Value : 0.6476
## Prevalence : 0.8831
## Detection Rate : 0.8600
## Detection Prevalence : 0.9344
## Balanced Accuracy : 0.6686
##
## 'Positive' Class : no
##
Results:
The deeper tree's accuracy, precision, recall, F1 score, and AUC are identical to those of the first model. This is expected: minsplit = 20 is already the rpart default, and with the default complexity parameter (cp = 0.01) still in place the tree is pruned well before it reaches a depth of 10, so both calls grow exactly the same tree. Adjusting the depth settings alone therefore does not improve the accuracy of the predictions; the complexity parameter would also need to be relaxed, as sketched below.
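To actually test the depth hypothesis, cp would have to be lowered so that maxdepth becomes the binding constraint. A sketch with illustrative settings (dt_model_deep is a new, hypothetical object, not one of the reported experiments):
# Sketch: relax cp so the tree can grow deep enough for maxdepth to matter
dt_model_deep <- rpart(y ~ ., data = train_data, method = "class",
                       control = rpart.control(cp = 0.001, maxdepth = 10, minsplit = 20))
# printcp(dt_model_deep)  # cp table for choosing a pruning point afterwards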
Hypothesis: Increasing the number of trees and the number of features tried at each split can enhance the accuracy of the predictions.
Hyperparameters:
- Held constant: the 80/20 train/test split.
- Varied: number of trees (ntree) and number of features per split (mtry).
Evaluation metrics: accuracy, precision, recall, F1-score, AUC-ROC.
rf_model1 <- randomForest(y ~ ., data=train_data, ntree=100, mtry=3)
rf_predictions1 <- predict(rf_model1, test_data)
# Evaluate
evaluate_model(rf_model1, test_data, rf_predictions1, "Random Forest (ntree=100, mtry=3)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Model: Random Forest (ntree=100, mtry=3)
## Accuracy: 0.9049884
## Precision: 0.922648
## Recall: 0.9740731
## F1 Score: 0.9476634
## AUC: 0.6786165
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7777 652
## yes 207 405
##
## Accuracy : 0.905
## 95% CI : (0.8988, 0.911)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.459e-11
##
## Kappa : 0.4371
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9741
## Specificity : 0.3832
## Pos Pred Value : 0.9226
## Neg Pred Value : 0.6618
## Prevalence : 0.8831
## Detection Rate : 0.8602
## Detection Prevalence : 0.9323
## Balanced Accuracy : 0.6786
##
## 'Positive' Class : no
##
rf_model2 <- randomForest(y ~ ., data=train_data, ntree=200, mtry=5)
rf_predictions2 <- predict(rf_model2, test_data)
# Evaluate
evaluate_model(rf_model2, test_data, rf_predictions2, "Random Forest (ntree=200, mtry=5)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Model: Random Forest (ntree=200, mtry=5)
## Accuracy: 0.9058732
## Precision: 0.930269
## Recall: 0.9658066
## F1 Score: 0.9477048
## AUC: 0.709488
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7711 578
## yes 273 479
##
## Accuracy : 0.9059
## 95% CI : (0.8997, 0.9118)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 2.119e-12
##
## Kappa : 0.4789
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9658
## Specificity : 0.4532
## Pos Pred Value : 0.9303
## Neg Pred Value : 0.6370
## Prevalence : 0.8831
## Detection Rate : 0.8529
## Detection Prevalence : 0.9168
## Balanced Accuracy : 0.7095
##
## 'Positive' Class : no
##
Compared with the first configuration (ntree = 100, mtry = 3), the random forest with more trees and more candidate features per split (ntree = 200, mtry = 5) improves accuracy, precision, specificity, and especially AUC, while recall drops slightly (0.9658 vs. 0.9741). Increasing the number of trees and features per split therefore improves the model's overall performance, with recall as the one exception.
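Rather than fixing ntree and mtry by hand, mtry could be tuned; a sketch using caret's cross-validated search is shown below. The grid, fold count, and object name rf_tuned are illustrative assumptions, and this search is slow on the full training set; it is not part of the reported experiments.
# Sketch: cross-validated search over mtry with caret (illustrative settings)
rf_grid <- expand.grid(mtry = c(3, 5, 7))
rf_tuned <- train(y ~ ., data = train_data, method = "rf", ntree = 100,
                  trControl = trainControl(method = "cv", number = 3),
                  tuneGrid = rf_grid)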
Hypothesis: Increasing the number of base learners can enhance the accuracy of the predictions.
Hyperparameters:
- Held constant: the 80/20 train/test split.
- Varied: number of base learners (mfinal).
Evaluation metrics: accuracy, precision, recall, F1-score, AUC-ROC.
# Train AdaBoost Model
adaboost_model1 <- boosting(y ~ ., data=train_data, boos=TRUE, mfinal=50)
# Make Predictions
adaboost_predictions1 <- predict(adaboost_model1, test_data)$class
# Ensure predicted factor levels match the test data
adaboost_predictions1 <- factor(adaboost_predictions1, levels=levels(test_data$y))
# Evaluate
evaluate_model(adaboost_model1, test_data, adaboost_predictions1, "AdaBoost (mfinal=50)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Model: AdaBoost (mfinal=50)
## Accuracy: 0.9068687
## Precision: 0.9342169
## Recall: 0.9622996
## F1 Score: 0.9480503
## AUC: 0.7252368
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7683 541
## yes 301 516
##
## Accuracy : 0.9069
## 95% CI : (0.9007, 0.9128)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 2.192e-13
##
## Kappa : 0.4997
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9623
## Specificity : 0.4882
## Pos Pred Value : 0.9342
## Neg Pred Value : 0.6316
## Prevalence : 0.8831
## Detection Rate : 0.8498
## Detection Prevalence : 0.9096
## Balanced Accuracy : 0.7252
##
## 'Positive' Class : no
##
# Train AdaBoost Model
adaboost_model2 <- boosting(y ~ ., data=train_data, boos=TRUE, mfinal=100)
# Make Predictions
adaboost_predictions2 <- predict(adaboost_model2, test_data)$class
# Ensure predicted factor levels match the test data
adaboost_predictions2 <- factor(adaboost_predictions2, levels=levels(test_data$y))
# Evaluate
evaluate_model(adaboost_model2, test_data, adaboost_predictions2, "AdaBoost (mfinal=100)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Model: AdaBoost (mfinal=100)
## Accuracy: 0.9060945
## Precision: 0.9339496
## Recall: 0.9616733
## F1 Score: 0.9476088
## AUC: 0.7239776
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7678 543
## yes 306 514
##
## Accuracy : 0.9061
## 95% CI : (0.8999, 0.912)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.291e-12
##
## Kappa : 0.4962
##
## Mcnemar's Test P-Value : 5.519e-16
##
## Sensitivity : 0.9617
## Specificity : 0.4863
## Pos Pred Value : 0.9339
## Neg Pred Value : 0.6268
## Prevalence : 0.8831
## Detection Rate : 0.8492
## Detection Prevalence : 0.9093
## Balanced Accuracy : 0.7240
##
## 'Positive' Class : no
##
Contrary to the hypothesis, the AdaBoost model with more base learners (mfinal = 100) performs slightly worse than the mfinal = 50 model on every reported metric (accuracy 0.9061 vs. 0.9069, F1 0.9476 vs. 0.9481, AUC 0.7240 vs. 0.7252), so doubling the number of base learners did not improve the predictions here.
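To see whether additional base learners ever help beyond mfinal = 50, adabag's errorevol() can trace the test error as trees are added; a sketch (err_evol is an illustrative object name, and this trace was not generated for this report):
# Sketch: test-set error after each boosting iteration for the mfinal = 100 model
err_evol <- errorevol(adaboost_model2, newdata = test_data)
# plot(err_evol$error, type = "l", xlab = "Iteration", ylab = "Test error")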
Comparing the six experiments, the hyperparameter changes helped the random forest, made no difference for the decision tree, and slightly hurt AdaBoost, so larger hyperparameter values do not automatically improve performance. The best-performing model overall was the AdaBoost model with mfinal = 50, which achieved the highest accuracy, F1 score, and AUC and the most balanced performance across metrics. All six models, however, have low specificity (at most 0.49), reflecting the strong class imbalance: 88.3% of clients did not subscribe.
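For easier side-by-side comparison, the headline test-set metrics reported above can be gathered into one table (values copied from the model outputs; results_summary is an illustrative object name):
# Test-set metrics copied from the model outputs above
results_summary <- tibble(
  model    = c("Decision Tree (cp=0.01)", "Decision Tree (maxdepth=10)",
               "Random Forest (ntree=100, mtry=3)", "Random Forest (ntree=200, mtry=5)",
               "AdaBoost (mfinal=50)", "AdaBoost (mfinal=100)"),
  accuracy = c(0.9024, 0.9024, 0.9050, 0.9059, 0.9069, 0.9061),
  f1       = c(0.9463, 0.9463, 0.9477, 0.9477, 0.9481, 0.9476),
  auc      = c(0.6686, 0.6686, 0.6786, 0.7095, 0.7252, 0.7240)
)
# arrange(results_summary, desc(auc))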