Part II: Pre-processing

1.1 Load the csv file and nessary libraries.

# Load Libraries
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(pROC)
library(ada)
library(tidyverse)
library(adabag)
library(corrplot)
library(dplyr)
library(knitr)
library(skimr)
library(readr)

# Read data file
df <- read.csv("https://raw.githubusercontent.com/Jennyjjxxzz/HW1/refs/heads/main/bank-full.csv", sep = ";")

head (df)

str(df)

## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr  "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "yes" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

1.2 Missing and Duplicated Values

Missing Values: There are large amount of “unknown” in the categorical variable. In this case I will keep the “unknown” value.
The reason why I chose to keep the “unknown” value instead place with NA, because in this dataset “unknown” variables are treated as a separate category, in case there is an underlying pattern with the target variable(y).
I think Tree-Base model and AdaBoost will handle well for categorical variables.
encode categorical variables

# Count NA values
colSums(is.na(df))

##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0

# Check for string "unknown"
sapply(df, function(x) sum(x == "unknown", na.rm = TRUE))

##       age       job   marital education   default   balance   housing      loan 
##         0       288         0      1857         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##     13020         0         0         0         0         0         0     36959 
##         y 
##         0

# Check for duplicated rows
sum(duplicated(df))

## [1] 0

1.3 Convert the variables to factor

cat_cols <- c("job","marital","education","default","housing","loan",
              "contact","month","poutcome","y")
num_cols <- c("age","balance","day","duration","campaign","pdays","previous")

df <- df %>%
  mutate(
    across(all_of(cat_cols), ~ as.factor(.x)),
    across(all_of(num_cols), ~ as.numeric(.x))
  )

Part III: Split the data 80 training/ 20 testing

set.seed(123)
# Slit the data (80% training, 20% testing)
idx   <- createDataPartition(df$y, p = 0.8, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]

# Check the distribution of target variable in both sets
prop.table(table(train$y))

## 
##       no      yes 
## 0.882997 0.117003

Part IV: Experiment

3.1 Decision Tree:

Hypothesis: A simple decision tree with default parameters will provide acceptable performance.

Decision Tree Model 1:

In Decision Tree Model 1, I use simple default setting for the model.

set.seed(123)

dt_model1 <- rpart(y ~ ., data = train, method = "class", control = rpart.control(minsplit = 10, cp = 0.01))

rpart.plot(dt_model1,box.palette = "auto", nn = TRUE, main="Default Decision Tree Model")

# Evaluate
pred_dt1 <- predict(dt_model1, test, type = "class")
probs_dt1 <- predict(dt_model1, test, type = "prob")[,"yes"] 
cm_dt1 <- confusionMatrix(pred_dt1, test$y, positive = "yes")

# Extract Precision, Recall, F1
precision_dt1 <- cm_dt1$byClass["Pos Pred Value"]
recall_dt1 <- cm_dt1$byClass["Sensitivity"]
f1_dt1 <- (2 * precision_dt1 * recall_dt1) / (precision_dt1 + recall_dt1)
# ROC/AUC
roc_dt1 <- pROC::roc(response = factor(test$y, levels = c("no", "yes")),predictor = probs_dt1)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_dt1 <- as.numeric(pROC::auc(roc_dt1))

results_dt1 <- data.frame(
  Model = "Decision Tree - Baseline",
  Accuracy = cm_dt1$overall["Accuracy"],
  Precision = precision_dt1,
  Recall = recall_dt1,
  F1_Score = f1_dt1,
  AUC = AUC_dt1
)
kable(results_dt1)

	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	Decision Tree - Baseline	0.8977989	0.617284	0.3311258	0.4310345	0.7227281

print(cm_dt1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7767  707
##        yes  217  350
##                                          
##                Accuracy : 0.8978         
##                  95% CI : (0.8914, 0.904)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 5.031e-06      
##                                          
##                   Kappa : 0.3805         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.33113        
##             Specificity : 0.97282        
##          Pos Pred Value : 0.61728        
##          Neg Pred Value : 0.91657        
##              Prevalence : 0.11691        
##          Detection Rate : 0.03871        
##    Detection Prevalence : 0.06271        
##       Balanced Accuracy : 0.65197        
##                                          
##        'Positive' Class : yes            
##

Decision Tree Model 2:

Hypothesis: Tuning and pruning the decision tree using cross-validation will reduce overfitting and improve model generalization.
In Decision Tree Model 2, I setup 5-fold CV.
Using CV to choose higher cp (stronger pruning) will reduce variance and improve generalization (higher AUC/F1) vs default setting of decision tree.

# Cross-validation setup

ctrl_cv <- trainControl(method = "cv", number = 5)

cp values close to 0 (like 0.000 or 0.005) allow the tree to grow very deep, and tests for overfitting scenarios.
cp values up to 0.05 prune the tree quite heavily, and tests for underfitting scenarios.

set.seed(123)
dt_tuned <- train(
  y ~ ., data = train,
  method = "rpart",
  trControl = ctrl_cv,
  tuneGrid = expand.grid(cp = seq(0.000, 0.05, by = 0.005))
)

probs <- predict(dt_tuned, newdata = test, type = "prob")[,"yes"]
preds <- ifelse(probs >= 0.5, "yes", "no")
cm_dt2 <- confusionMatrix(factor(preds, levels = c("yes", "no")), test$y, positive = "yes")

## Warning in confusionMatrix.default(factor(preds, levels = c("yes", "no")), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

roc_dt2 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
                     predictor = probs)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_dt2 <- as.numeric(pROC::auc(roc_dt2))

# Extract Precision, Recall, F1
precision_dt2 <- cm_dt2$byClass["Pos Pred Value"]
recall_dt2 <- cm_dt1$byClass["Sensitivity"]
f1_dt2 <- (2 * precision_dt2 * recall_dt2) / (precision_dt2 + recall_dt2)

results_dt2 <- data.frame(
  Model = "Decision Tree - Baseline",
  Accuracy = cm_dt1$overall["Accuracy"],
  Precision = precision_dt2,
  Recall = recall_dt2,
  F1_Score = f1_dt2,
  AUC = AUC_dt2
)
kable(results_dt2)

	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	Decision Tree - Baseline	0.8977989	0.617284	0.3311258	0.4310345	0.7227281

print(cm_dt2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7767  707
##        yes  217  350
##                                          
##                Accuracy : 0.8978         
##                  95% CI : (0.8914, 0.904)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 5.031e-06      
##                                          
##                   Kappa : 0.3805         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.33113        
##             Specificity : 0.97282        
##          Pos Pred Value : 0.61728        
##          Neg Pred Value : 0.91657        
##              Prevalence : 0.11691        
##          Detection Rate : 0.03871        
##    Detection Prevalence : 0.06271        
##       Balanced Accuracy : 0.65197        
##                                          
##        'Positive' Class : yes            
##

The optimal cp value is near the peak of the curve (around 0.005–0.010).

# Visualize the effect of cp
plot(dt_tuned, main = "Decision Tree Hyperparameter Tuning (cp vs Accuracy)")

3.2 Random Forest:

Hypothesis: A baseline Random Forest model with default settings (100 trees) will outperform the single decision tree models in both accuracy and AUC.

Random Forest Model 1 (100 trees)

Standard Random Forest with All Features (100 trees)

set.seed(123)
rf_model1 <- randomForest(
  y ~ ., data = train,
  ntree = 100,
  importance = TRUE,
  num.threads = parallel::detectCores()
)

varImpPlot(rf_model1, main = "Random Forest - Feature Importance")

rf_probs <- predict(rf_model1, newdata = test, type = "prob")[,"yes"]
rf_preds <- ifelse(rf_probs >= 0.5, "yes", "no")
cm_rf1 <- confusionMatrix(factor(rf_preds, levels = c("yes", "no")), test$y, positive = "yes")

## Warning in confusionMatrix.default(factor(rf_preds, levels = c("yes", "no")), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

precision_rf1 <- cm_rf1$byClass["Pos Pred Value"]
recall_rf1 <- cm_rf1$byClass["Sensitivity"]
f1_rf1 <- (2 * precision_rf1 * recall_rf1) / (precision_rf1 + recall_rf1)
roc_rf1 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
                     predictor = rf_probs)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_rf1 <- as.numeric(pROC::auc(roc_rf1))

results_rf1 <- data.frame(
  Model = "Random Forest Model 1 (100 trees)",
  Accuracy = cm_rf1$overall["Accuracy"],
  Precision = precision_rf1,
  Recall = recall_rf1,
  F1_Score = f1_rf1,
  AUC = AUC_rf1
)
kable(results_rf1)

	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	Random Forest Model 1 (100 trees)	0.9066475	0.6329588	0.4796594	0.5457481	0.9275931

Random Forest Model 2 (500 trees)

Hypothesis: Increase the number of the tree in the model can improving the model.
In the Random Forest Model 2, I increase the number of the tree up to 500.

set.seed(123)
rf_model2 <- randomForest(
  y ~ ., data = train,
  ntree = 500,
  importance = TRUE,
  num.threads = parallel::detectCores()
)

varImpPlot(rf_model2, main = "Random Forest - Feature Importance")

rf_probs2 <- predict(rf_model2, newdata = test, type = "prob")[,"yes"]
rf_preds2 <- ifelse(rf_probs2 >= 0.5, "yes", "no")
cm_rf2 <- confusionMatrix(factor(rf_preds2, levels = c("yes", "no")), test$y, positive = "yes")

## Warning in confusionMatrix.default(factor(rf_preds2, levels = c("yes", "no")),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.

precision_rf2 <- cm_rf1$byClass["Pos Pred Value"]
recall_rf2 <- cm_rf1$byClass["Sensitivity"]
f1_rf2 <- (2 * precision_rf2 * recall_rf2) / (precision_rf2 + recall_rf2)
roc_rf2 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
                     predictor = rf_probs2)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_rf2 <- as.numeric(pROC::auc(roc_rf2))

results_rf2 <- data.frame(
  Model = "Random Forest Model 2 (500 trees) ",
  Accuracy = cm_rf1$overall["Accuracy"],
  Precision = precision_rf2,
  Recall = recall_rf2,
  F1_Score = f1_rf2,
  AUC = AUC_rf2
)
kable(results_rf2)

	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	Random Forest Model 2 (500 trees)	0.9066475	0.6329588	0.4796594	0.5457481	0.9299513

Random Forest Model 3(tuning)

Hypothesis: Tuning the mtry parameter will further optimize the bias–variance trade-off, improving performance.
In this model I keep the number of tree 500.
Set up the number of features randomly selected at each split. It controls how much randomness there is in the forest.
Smaller mtry is less correlation andlower variance, but possibly higher bias.
Larger mtry is lower bias, but possibly higher variance.

set.seed(123)
rf_tuned <- randomForest(
  y ~ ., data = train,
  method = "rf",
  trControl = ctrl_cv,
  metric = "ROC",
  tuneGrid = expand.grid(mtry = c(2,3,4,5,6,8,10)),
  ntree = 500,
  importance = TRUE,
  num.threads = parallel::detectCores()
)

rf_probs3 <- predict(rf_tuned, newdata = test, type = "prob")[,"yes"]
rf_preds3 <- ifelse(rf_probs3 >= 0.5, "yes", "no")
cm_rf3 <- confusionMatrix(factor(rf_preds3, levels = c("yes", "no")), test$y, positive = "yes")

## Warning in confusionMatrix.default(factor(rf_preds3, levels = c("yes", "no")),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.

precision_rf3 <- cm_rf2$byClass["Pos Pred Value"]
recall_rf3 <- cm_rf2$byClass["Sensitivity"]
f1_rf3 <- (2 * precision_rf3 * recall_rf3) / (precision_rf3 + recall_rf3)
roc_rf3 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
                     predictor = rf_probs)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_rf3 <- as.numeric(pROC::auc(roc_rf3))

results_rf3 <- data.frame(
  Model = "Random Forest - Tuned",
  Accuracy = cm_rf3$overall["Accuracy"],
  Precision = precision_rf3,
  Recall = recall_rf3,
  F1_Score = f1_rf3,
  AUC = AUC_rf3
)
kable(results_rf3)

	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	Random Forest - Tuned	0.9055414	0.6292994	0.4673605	0.5363735	0.9275931

3.3 AdaBoost:

Adaboost Model 1:

Base Adaboost Model setup with low mfinal = 10.

set.seed(123)

# Train the Adaboost model
ada_model1 <- boosting(y ~ ., data = train,
                  boos = TRUE, mfinal = 10)

# Make predictions on the test set
pred_ab <- predict(ada_model1, newdata = test)
pred_ab_prob <- pred_ab$prob[,2]
pred_ab_class <- factor(ifelse(pred_ab_prob > 0.5, "yes", "no"), levels = levels(test$y)) 

conf_matrax_ab <- confusionMatrix(pred_ab_class, test$y)
roc_ab <- roc(ifelse(test$y == "yes", 1, 0), pred_ab_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc_ab_control <- auc(roc_ab)

print(conf_matrax_ab)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7708  604
##        yes  276  453
##                                           
##                Accuracy : 0.9027          
##                  95% CI : (0.8964, 0.9087)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1.576e-09       
##                                           
##                   Kappa : 0.4553          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9654          
##             Specificity : 0.4286          
##          Pos Pred Value : 0.9273          
##          Neg Pred Value : 0.6214          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8526          
##    Detection Prevalence : 0.9194          
##       Balanced Accuracy : 0.6970          
##                                           
##        'Positive' Class : no              
##

# Extract accuracy, precision, recall
accuracy_ab  <- conf_matrax_ab$overall["Accuracy"]
precision_ab <- conf_matrax_ab$byClass["Pos Pred Value"]
recall_ab    <- conf_matrax_ab$byClass["Sensitivity"]
roc_ab <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
                    predictor = pred_ab_prob)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_ab <- as.numeric(pROC::auc(roc_ab))

# Compute F1 Score manually
f1_ab <- (2 * precision_ab * recall_ab) / (precision_ab + recall_ab)

# Combine metrics into a clean table
results_ab <- data.frame(
  Model     = "AdaBoost (adabag)",
  Accuracy  = accuracy_ab,
  Precision = precision_ab,
  Recall    = recall_ab,
  F1_Score  = f1_ab,
  AUC       = AUC_ab
)

kable(results_ab, caption = "AdaBoost Model 1")

AdaBoost Model 1
	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	AdaBoost (adabag)	0.9026656	0.927334	0.9654309	0.945999	0.9108971

Adaboost Model 2:

Increase the mfinal to 100, more weak learner may have higher capacity.

set.seed(123)

# Train the Adaboost model
ada_model2 <- boosting(y ~ ., data = train,
                  boos = TRUE, mfinal = 100,
                  control = rpart.control(cp = 0.0001,
                                          minsplit = 3))

# Make predictions on the test set
pred_ab2 <- predict(ada_model2, newdata = test)
pred_ab_prob2 <- pred_ab2$prob[,2]
pred_ab_class2 <- factor(ifelse(pred_ab_prob2 > 0.5, "yes", "no"), levels = levels(test$y)) 

conf_matrax_ab2 <- confusionMatrix(pred_ab_class2, test$y)
roc_ab2 <- roc(ifelse(test$y == "yes", 1, 0), pred_ab_prob2)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc_ab_control2 <- auc(roc_ab2)

print(conf_matrax_ab2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7700  574
##        yes  284  483
##                                           
##                Accuracy : 0.9051          
##                  95% CI : (0.8989, 0.9111)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1.151e-11       
##                                           
##                   Kappa : 0.4783          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9644          
##             Specificity : 0.4570          
##          Pos Pred Value : 0.9306          
##          Neg Pred Value : 0.6297          
##              Prevalence : 0.8831          
##          Detection Rate : 0.8517          
##    Detection Prevalence : 0.9152          
##       Balanced Accuracy : 0.7107          
##                                           
##        'Positive' Class : no              
##

# Extract accuracy, precision, recall
accuracy_ab2  <- conf_matrax_ab2$overall["Accuracy"]
precision_ab2 <- conf_matrax_ab2$byClass["Pos Pred Value"]
recall_ab2    <- conf_matrax_ab2$byClass["Sensitivity"]
roc_ab2 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
                    predictor = pred_ab_prob2)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

AUC_ab2 <- as.numeric(pROC::auc(roc_ab2))
# Compute F1 Score manually
f1_ab2 <- (2 * precision_ab * recall_ab) / (precision_ab + recall_ab)

# Combine metrics into a clean table
results_ab2 <- data.frame(
  Model     = "AdaBoost Model 2 - Tuned",
  Accuracy  = accuracy_ab2,
  Precision = precision_ab2,
  Recall    = recall_ab2,
  F1_Score  = f1_ab2,
  AUC       = AUC_ab2
)

kable(results_ab2, caption = "AdaBoost Model Performance Summary")

AdaBoost Model Performance Summary
	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy	AdaBoost Model 2 - Tuned	0.905099	0.9306261	0.9644289	0.945999	0.9253121

Final Result Table

# Standardize each results table
standardize_results <- function(df) {
  stopifnot(is.data.frame(df))
  
  # Normalize metric names
  names(df) <- gsub("^F1$", "F1_Score", names(df), ignore.case = TRUE)
  names(df) <- gsub("^F1.Score$", "F1_Score", names(df), ignore.case = TRUE)
  names(df) <- gsub("^AOC$", "AUC", names(df), ignore.case = TRUE)
  names(df) <- gsub("^auc$", "AUC", names(df), ignore.case = TRUE)
  names(df) <- gsub("^precision$", "Precision", names(df), ignore.case = TRUE)
  names(df) <- gsub("^recall$", "Recall", names(df), ignore.case = TRUE)
  names(df) <- gsub("^accuracy$", "Accuracy", names(df), ignore.case = TRUE)
  names(df) <- gsub("^model$", "Model", names(df), ignore.case = TRUE)

  wanted <- c("Model", "Accuracy", "Precision", "Recall", "F1_Score", "AUC")
  
  # Coerce types & select only the wanted columns in order
  df %>%
    mutate(
      Model     = as.character(Model),
      Accuracy  = as.numeric(Accuracy),
      Precision = as.numeric(Precision),
      Recall    = as.numeric(Recall),
      F1_Score  = as.numeric(F1_Score),
      AUC       = as.numeric(AUC)
    ) %>%
    select(all_of(wanted))
}

# Collect the result data frames
res_names <- c(
  "results_dt1","results_dt2",
  "results_rf1","results_rf2","results_rf3",
  "results_ab","results_ab2"
)

res_list <- mget(res_names, ifnotfound = list(NULL), inherits = TRUE)
res_list <- Filter(Negate(is.null), res_list)

# Standardize
all_results <- res_list %>%
  lapply(standardize_results) %>%
  bind_rows() %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  mutate(
    Model = c(
      "Decision Tree - Model 1",
      "Decision Tree - Model 2 Tuned",
      "Random Forest - Model 1_100 Trees",
      "Random Forest - Model 2_500 Trees",
      "Random Forest - Model 3_Tuned",
      "AdaBoost - Model 1_10 Estimators",
      "AdaBoost - Model 2_100 Estimators"
    )
  )

knitr::kable(all_results, caption = "Model Comparison: Accuracy, Precision, Recall, F1, AUC")

Model Comparison: Accuracy, Precision, Recall, F1, AUC
	Model	Accuracy	Precision	Recall	F1_Score	AUC
Accuracy…1	Decision Tree - Model 1	0.8978	0.6173	0.3311	0.4310	0.7227
Accuracy…2	Decision Tree - Model 2 Tuned	0.8978	0.6173	0.3311	0.4310	0.7227
Accuracy…3	Random Forest - Model 1_100 Trees	0.9066	0.6330	0.4797	0.5457	0.9276
Accuracy…4	Random Forest - Model 2_500 Trees	0.9066	0.6330	0.4797	0.5457	0.9300
Accuracy…5	Random Forest - Model 3_Tuned	0.9055	0.6293	0.4674	0.5364	0.9276
Accuracy…6	AdaBoost - Model 1_10 Estimators	0.9027	0.9273	0.9654	0.9460	0.9109
Accuracy…7	AdaBoost - Model 2_100 Estimators	0.9051	0.9306	0.9644	0.9460	0.9253

Conclusion:

Among all tested algorithms, AdaBoost Model 2 with 100 estimators delivered the best overall performance, striking an excellent balance between precision, recall, and AUC. Random Forest models were a strong second choice.

Data 622 Machine Learning and Big Data_HW2

Jiaxin Zheng

2025-10-19

Part I

1.1 Assignment Introduction