This assignment focuses on one of the most important aspects of data science: Exploratory Data Analysis (EDA). Many surveys show that data scientists spend 60-80% of their time on data preparation. EDA allows you to identify data gaps and data imbalances, improve data quality, create better features, and gain a deep understanding of your data before model training, which ultimately helps train better models. In machine learning there is a saying that "better data beats better algorithms," meaning it is more productive to spend time improving data quality than improving the code that trains the model.
# Load Libraries
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(pROC)
library(ada)
library(tidyverse)
library(adabag)
library(corrplot)
library(dplyr)
library(knitr)
library(skimr)
library(readr)
# Read data file
df <- read.csv("https://raw.githubusercontent.com/Jennyjjxxzz/HW1/refs/heads/main/bank-full.csv", sep = ";")
head(df)
str(df)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
# Count NA values
colSums(is.na(df))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# Check for string "unknown"
sapply(df, function(x) sum(x == "unknown", na.rm = TRUE))
## age job marital education default balance housing loan
## 0 288 0 1857 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 13020 0 0 0 0 0 0 36959
## y
## 0
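The "unknown" values are kept as their own factor level in the conversion below. If one preferred to treat them as missing instead, a minimal sketch (not applied in this analysis; df_na is an illustrative name) would be:
# Optional alternative: recode "unknown" to NA in the character columns
df_na <- df %>% mutate(across(where(is.character), ~ na_if(.x, "unknown")))
colSums(is.na(df_na))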
# Check for duplicated rows
sum(duplicated(df))
## [1] 0
cat_cols <- c("job","marital","education","default","housing","loan",
"contact","month","poutcome","y")
num_cols <- c("age","balance","day","duration","campaign","pdays","previous")
df <- df %>%
mutate(
across(all_of(cat_cols), ~ as.factor(.x)),
across(all_of(num_cols), ~ as.numeric(.x))
)
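skimr is loaded above but not otherwise used; as an optional overview of the cleaned data (output omitted), one could run:
# Quick summary of every column: type, missingness, and distribution
skim(df)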
set.seed(123)
# Split the data (80% training, 20% testing)
idx <- createDataPartition(df$y, p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]
# Check the distribution of the target variable in the training set
prop.table(table(train$y))
##
## no yes
## 0.882997 0.117003
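The same check on the test split (output not shown here) would confirm that createDataPartition preserved the class proportions:
# Class distribution of the target variable in the test set
prop.table(table(test$y))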
set.seed(123)
dt_model1 <- rpart(y ~ ., data = train, method = "class", control = rpart.control(minsplit = 10, cp = 0.01))
rpart.plot(dt_model1, box.palette = "auto", nn = TRUE, main = "Default Decision Tree Model")
# Evaluate
pred_dt1 <- predict(dt_model1, test, type = "class")
probs_dt1 <- predict(dt_model1, test, type = "prob")[,"yes"]
cm_dt1 <- confusionMatrix(pred_dt1, test$y, positive = "yes")
# Extract Precision, Recall, F1
precision_dt1 <- cm_dt1$byClass["Pos Pred Value"]
recall_dt1 <- cm_dt1$byClass["Sensitivity"]
f1_dt1 <- (2 * precision_dt1 * recall_dt1) / (precision_dt1 + recall_dt1)
# ROC/AUC
roc_dt1 <- pROC::roc(response = factor(test$y, levels = c("no", "yes")),predictor = probs_dt1)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_dt1 <- as.numeric(pROC::auc(roc_dt1))
results_dt1 <- data.frame(
Model = "Decision Tree - Baseline",
Accuracy = cm_dt1$overall["Accuracy"],
Precision = precision_dt1,
Recall = recall_dt1,
F1_Score = f1_dt1,
AUC = AUC_dt1
)
kable(results_dt1, row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| Decision Tree - Baseline | 0.8977989 | 0.617284 | 0.3311258 | 0.4310345 | 0.7227281 |
print(cm_dt1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7767 707
## yes 217 350
##
## Accuracy : 0.8978
## 95% CI : (0.8914, 0.904)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 5.031e-06
##
## Kappa : 0.3805
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.33113
## Specificity : 0.97282
## Pos Pred Value : 0.61728
## Neg Pred Value : 0.91657
## Prevalence : 0.11691
## Detection Rate : 0.03871
## Detection Prevalence : 0.06271
## Balanced Accuracy : 0.65197
##
## 'Positive' Class : yes
##
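As an optional visual check (sketch, output not shown), the baseline ROC curve can be plotted directly from the roc_dt1 object:
# ROC curve for the baseline decision tree, with the AUC printed on the plot
plot(roc_dt1, print.auc = TRUE, main = "ROC - Decision Tree Baseline")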
Hypothesis: Tuning and pruning the decision tree using cross-validation will reduce overfitting and improve model generalization.
In Decision Tree Model 2, I set up 5-fold cross-validation.
Using CV to choose a larger cp (stronger pruning) should reduce variance and improve generalization (higher AUC/F1) compared with the default decision tree.
# Cross-validation setup
ctrl_cv <- trainControl(method = "cv", number = 5)
cp values close to 0 (e.g., 0.000 or 0.005) allow the tree to grow very deep, probing the overfitting end of the range.
cp values up to 0.05 prune the tree heavily, probing the underfitting end of the range.
set.seed(123)
dt_tuned <- train(
y ~ ., data = train,
method = "rpart",
trControl = ctrl_cv,
tuneGrid = expand.grid(cp = seq(0.000, 0.05, by = 0.005))
)
probs <- predict(dt_tuned, newdata = test, type = "prob")[,"yes"]
preds <- ifelse(probs >= 0.5, "yes", "no")
cm_dt2 <- confusionMatrix(factor(preds, levels = levels(test$y)), test$y, positive = "yes")
roc_dt2 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
predictor = probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_dt2 <- as.numeric(pROC::auc(roc_dt2))
# Extract Precision, Recall, F1
precision_dt2 <- cm_dt2$byClass["Pos Pred Value"]
recall_dt2 <- cm_dt2$byClass["Sensitivity"]
f1_dt2 <- (2 * precision_dt2 * recall_dt2) / (precision_dt2 + recall_dt2)
results_dt2 <- data.frame(
Model = "Decision Tree - Baseline",
Accuracy = cm_dt1$overall["Accuracy"],
Precision = precision_dt2,
Recall = recall_dt2,
F1_Score = f1_dt2,
AUC = AUC_dt2
)
kable(results_dt2, row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| Decision Tree - Tuned (5-fold CV) | 0.8977989 | 0.617284 | 0.3311258 | 0.4310345 | 0.7227281 |
print(cm_dt2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7767 707
## yes 217 350
##
## Accuracy : 0.8978
## 95% CI : (0.8914, 0.904)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 5.031e-06
##
## Kappa : 0.3805
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.33113
## Specificity : 0.97282
## Pos Pred Value : 0.61728
## Neg Pred Value : 0.91657
## Prevalence : 0.11691
## Detection Rate : 0.03871
## Detection Prevalence : 0.06271
## Balanced Accuracy : 0.65197
##
## 'Positive' Class : yes
##
# Visualize the effect of cp
plot(dt_tuned, main = "Decision Tree Hyperparameter Tuning (cp vs Accuracy)")
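To see which cp the 5-fold CV selected and what the pruned tree looks like, a short sketch (output not shown):
# cp value chosen by cross-validation
dt_tuned$bestTune
# Plot the final pruned tree stored inside the caret object
rpart.plot(dt_tuned$finalModel, box.palette = "auto", main = "Pruned Decision Tree (CV-selected cp)")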
set.seed(123)
rf_model1 <- randomForest(
y ~ ., data = train,
ntree = 100,
importance = TRUE
)
varImpPlot(rf_model1, main = "Random Forest - Feature Importance")
rf_probs <- predict(rf_model1, newdata = test, type = "prob")[,"yes"]
rf_preds <- ifelse(rf_probs >= 0.5, "yes", "no")
cm_rf1 <- confusionMatrix(factor(rf_preds, levels = levels(test$y)), test$y, positive = "yes")
precision_rf1 <- cm_rf1$byClass["Pos Pred Value"]
recall_rf1 <- cm_rf1$byClass["Sensitivity"]
f1_rf1 <- (2 * precision_rf1 * recall_rf1) / (precision_rf1 + recall_rf1)
roc_rf1 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
predictor = rf_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_rf1 <- as.numeric(pROC::auc(roc_rf1))
results_rf1 <- data.frame(
Model = "Random Forest Model 1 (100 trees)",
Accuracy = cm_rf1$overall["Accuracy"],
Precision = precision_rf1,
Recall = recall_rf1,
F1_Score = f1_rf1,
AUC = AUC_rf1
)
kable(results_rf1, row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| Random Forest Model 1 (100 trees) | 0.9066475 | 0.6329588 | 0.4796594 | 0.5457481 | 0.9275931 |
Hypothesis: Increasing the number of trees in the forest will improve model performance.
In Random Forest Model 2, I increase the number of trees to 500.
set.seed(123)
rf_model2 <- randomForest(
y ~ ., data = train,
ntree = 500,
importance = TRUE
)
varImpPlot(rf_model2, main = "Random Forest (500 trees) - Feature Importance")
rf_probs2 <- predict(rf_model2, newdata = test, type = "prob")[,"yes"]
rf_preds2 <- ifelse(rf_probs2 >= 0.5, "yes", "no")
cm_rf2 <- confusionMatrix(factor(rf_preds2, levels = levels(test$y)), test$y, positive = "yes")
precision_rf2 <- cm_rf2$byClass["Pos Pred Value"]
recall_rf2 <- cm_rf2$byClass["Sensitivity"]
f1_rf2 <- (2 * precision_rf2 * recall_rf2) / (precision_rf2 + recall_rf2)
roc_rf2 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
predictor = rf_probs2)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_rf2 <- as.numeric(pROC::auc(roc_rf2))
results_rf2 <- data.frame(
Model = "Random Forest Model 2 (500 trees) ",
Accuracy = cm_rf1$overall["Accuracy"],
Precision = precision_rf2,
Recall = recall_rf2,
F1_Score = f1_rf2,
AUC = AUC_rf2
)
kable(results_rf2, row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| Random Forest Model 2 (500 trees) | 0.9066475 | 0.6329588 | 0.4796594 | 0.5457481 | 0.9299513 |
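A hedged way to check whether 500 trees were actually needed is to plot the OOB error against the number of trees; if the curve flattens early, the extra trees mainly add computation (sketch, output not shown):
# OOB error as a function of the number of trees grown
plot(rf_model2, main = "Random Forest Model 2 - OOB Error vs Number of Trees")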
Hypothesis: Tuning the mtry parameter will further optimize the bias-variance trade-off and improve performance.
In this model I keep the number of trees at 500.
mtry sets the number of features randomly considered at each split and controls how much randomness there is in the forest.
A smaller mtry means less correlation between trees and lower variance, but possibly higher bias.
A larger mtry means lower bias, but possibly higher variance.
set.seed(123)
# 5-fold CV with class probabilities so ROC can be used as the tuning metric
ctrl_cv_roc <- trainControl(method = "cv", number = 5,
classProbs = TRUE, summaryFunction = twoClassSummary)
rf_tuned <- train(
y ~ ., data = train,
method = "rf",
trControl = ctrl_cv_roc,
metric = "ROC",
tuneGrid = expand.grid(mtry = c(2,3,4,5,6,8,10)),
ntree = 500,
importance = TRUE
)
rf_probs3 <- predict(rf_tuned, newdata = test, type = "prob")[,"yes"]
rf_preds3 <- ifelse(rf_probs3 >= 0.5, "yes", "no")
cm_rf3 <- confusionMatrix(factor(rf_preds3, levels = levels(test$y)), test$y, positive = "yes")
precision_rf3 <- cm_rf3$byClass["Pos Pred Value"]
recall_rf3 <- cm_rf3$byClass["Sensitivity"]
f1_rf3 <- (2 * precision_rf3 * recall_rf3) / (precision_rf3 + recall_rf3)
roc_rf3 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
predictor = rf_probs3)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_rf3 <- as.numeric(pROC::auc(roc_rf3))
results_rf3 <- data.frame(
Model = "Random Forest - Tuned",
Accuracy = cm_rf3$overall["Accuracy"],
Precision = precision_rf3,
Recall = recall_rf3,
F1_Score = f1_rf3,
AUC = AUC_rf3
)
kable(results_rf3, row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| Random Forest - Tuned | 0.9055414 | 0.6292994 | 0.4673605 | 0.5363735 | 0.9275931 |
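Assuming rf_tuned was fit with caret::train as above, the cross-validation profile and the selected mtry can be inspected (sketch, output not shown):
# ROC across the mtry grid and the value chosen by CV
plot(rf_tuned)
rf_tuned$bestTune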
set.seed(123)
# Train the Adaboost model
ada_model1 <- boosting(y ~ ., data = train,
boos = TRUE, mfinal = 10)
# Make predictions on the test set
pred_ab <- predict(ada_model1, newdata = test)
pred_ab_prob <- pred_ab$prob[,2]
pred_ab_class <- factor(ifelse(pred_ab_prob > 0.5, "yes", "no"), levels = levels(test$y))
conf_matrax_ab <- confusionMatrix(pred_ab_class, test$y)
roc_ab <- roc(ifelse(test$y == "yes", 1, 0), pred_ab_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_ab_control <- auc(roc_ab)
print(conf_matrax_ab)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7708 604
## yes 276 453
##
## Accuracy : 0.9027
## 95% CI : (0.8964, 0.9087)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.576e-09
##
## Kappa : 0.4553
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9654
## Specificity : 0.4286
## Pos Pred Value : 0.9273
## Neg Pred Value : 0.6214
## Prevalence : 0.8831
## Detection Rate : 0.8526
## Detection Prevalence : 0.9194
## Balanced Accuracy : 0.6970
##
## 'Positive' Class : no
##
# Extract accuracy, precision, recall
accuracy_ab <- conf_matrax_ab$overall["Accuracy"]
precision_ab <- conf_matrax_ab$byClass["Pos Pred Value"]
recall_ab <- conf_matrax_ab$byClass["Sensitivity"]
roc_ab <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
predictor = pred_ab_prob)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_ab <- as.numeric(pROC::auc(roc_ab))
# Compute F1 Score manually
f1_ab <- (2 * precision_ab * recall_ab) / (precision_ab + recall_ab)
# Combine metrics into a clean table
results_ab <- data.frame(
Model = "AdaBoost (adabag)",
Accuracy = accuracy_ab,
Precision = precision_ab,
Recall = recall_ab,
F1_Score = f1_ab,
AUC = AUC_ab
)
kable(results_ab, caption = "AdaBoost Model 1", row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| AdaBoost (adabag) | 0.9026656 | 0.927334 | 0.9654309 | 0.945999 | 0.9108971 |
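Because confusionMatrix() was called here without positive = "yes", the precision and recall above describe the majority ("no") class. A minimal sketch for recomputing the minority-class metrics, so the AdaBoost rows are comparable with the tree and forest models (cm_ab_yes and the *_yes names are illustrative):
# Recompute metrics with "yes" as the positive class
cm_ab_yes <- confusionMatrix(pred_ab_class, test$y, positive = "yes")
precision_ab_yes <- cm_ab_yes$byClass["Pos Pred Value"]
recall_ab_yes <- cm_ab_yes$byClass["Sensitivity"]
f1_ab_yes <- (2 * precision_ab_yes * recall_ab_yes) / (precision_ab_yes + recall_ab_yes)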
set.seed(123)
# Train the Adaboost model
ada_model2 <- boosting(y ~ ., data = train,
boos = TRUE, mfinal = 100,
control = rpart.control(cp = 0.0001,
minsplit = 3))
# Make predictions on the test set
pred_ab2 <- predict(ada_model2, newdata = test)
pred_ab_prob2 <- pred_ab2$prob[,2]
pred_ab_class2 <- factor(ifelse(pred_ab_prob2 > 0.5, "yes", "no"), levels = levels(test$y))
conf_matrax_ab2 <- confusionMatrix(pred_ab_class2, test$y)
roc_ab2 <- roc(ifelse(test$y == "yes", 1, 0), pred_ab_prob2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_ab_control2 <- auc(roc_ab2)
print(conf_matrax_ab2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7700 574
## yes 284 483
##
## Accuracy : 0.9051
## 95% CI : (0.8989, 0.9111)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1.151e-11
##
## Kappa : 0.4783
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9644
## Specificity : 0.4570
## Pos Pred Value : 0.9306
## Neg Pred Value : 0.6297
## Prevalence : 0.8831
## Detection Rate : 0.8517
## Detection Prevalence : 0.9152
## Balanced Accuracy : 0.7107
##
## 'Positive' Class : no
##
# Extract accuracy, precision, recall
accuracy_ab2 <- conf_matrax_ab2$overall["Accuracy"]
precision_ab2 <- conf_matrax_ab2$byClass["Pos Pred Value"]
recall_ab2 <- conf_matrax_ab2$byClass["Sensitivity"]
roc_ab2 <- pROC::roc(response = factor(test$y, levels = c("no","yes")),
predictor = pred_ab_prob2)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
AUC_ab2 <- as.numeric(pROC::auc(roc_ab2))
# Compute F1 Score manually
f1_ab2 <- (2 * precision_ab2 * recall_ab2) / (precision_ab2 + recall_ab2)
# Combine metrics into a clean table
results_ab2 <- data.frame(
Model = "AdaBoost Model 2 - Tuned",
Accuracy = accuracy_ab2,
Precision = precision_ab2,
Recall = recall_ab2,
F1_Score = f1_ab2,
AUC = AUC_ab2
)
kable(results_ab2, caption = "AdaBoost Model Performance Summary", row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| AdaBoost Model 2 - Tuned | 0.905099 | 0.9306261 | 0.9644289 | 0.945999 | 0.9253121 |
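adabag also records per-variable importance, which can be compared against the random forest importance plots (sketch, output not shown):
# Variables ranked by their contribution to the boosted ensemble
sort(ada_model2$importance, decreasing = TRUE)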
# Standardize each results table
standardize_results <- function(df) {
stopifnot(is.data.frame(df))
# Normalize metric names
names(df) <- gsub("^F1$", "F1_Score", names(df), ignore.case = TRUE)
names(df) <- gsub("^F1.Score$", "F1_Score", names(df), ignore.case = TRUE)
names(df) <- gsub("^AOC$", "AUC", names(df), ignore.case = TRUE)
names(df) <- gsub("^auc$", "AUC", names(df), ignore.case = TRUE)
names(df) <- gsub("^precision$", "Precision", names(df), ignore.case = TRUE)
names(df) <- gsub("^recall$", "Recall", names(df), ignore.case = TRUE)
names(df) <- gsub("^accuracy$", "Accuracy", names(df), ignore.case = TRUE)
names(df) <- gsub("^model$", "Model", names(df), ignore.case = TRUE)
wanted <- c("Model", "Accuracy", "Precision", "Recall", "F1_Score", "AUC")
# Coerce types & select only the wanted columns in order
df %>%
mutate(
Model = as.character(Model),
Accuracy = as.numeric(Accuracy),
Precision = as.numeric(Precision),
Recall = as.numeric(Recall),
F1_Score = as.numeric(F1_Score),
AUC = as.numeric(AUC)
) %>%
select(all_of(wanted))
}
# Collect the result data frames
res_names <- c(
"results_dt1","results_dt2",
"results_rf1","results_rf2","results_rf3",
"results_ab","results_ab2"
)
res_list <- mget(res_names, ifnotfound = list(NULL), inherits = TRUE)
res_list <- Filter(Negate(is.null), res_list)
# Standardize
all_results <- res_list %>%
lapply(standardize_results) %>%
bind_rows() %>%
mutate(across(where(is.numeric), ~ round(., 4))) %>%
mutate(
Model = c(
"Decision Tree - Model 1",
"Decision Tree - Model 2 Tuned",
"Random Forest - Model 1_100 Trees",
"Random Forest - Model 2_500 Trees",
"Random Forest - Model 3_Tuned",
"AdaBoost - Model 1_10 Estimators",
"AdaBoost - Model 2_100 Estimators"
)
)
knitr::kable(all_results, caption = "Model Comparison: Accuracy, Precision, Recall, F1, AUC", row.names = FALSE)

| Model | Accuracy | Precision | Recall | F1_Score | AUC |
|---|---|---|---|---|---|
| Decision Tree - Model 1 | 0.8978 | 0.6173 | 0.3311 | 0.4310 | 0.7227 |
| Decision Tree - Model 2 Tuned | 0.8978 | 0.6173 | 0.3311 | 0.4310 | 0.7227 |
| Random Forest - Model 1_100 Trees | 0.9066 | 0.6330 | 0.4797 | 0.5457 | 0.9276 |
| Random Forest - Model 2_500 Trees | 0.9066 | 0.6330 | 0.4797 | 0.5457 | 0.9300 |
| Random Forest - Model 3_Tuned | 0.9055 | 0.6293 | 0.4674 | 0.5364 | 0.9276 |
| AdaBoost - Model 1_10 Estimators | 0.9027 | 0.9273 | 0.9654 | 0.9460 | 0.9109 |
| AdaBoost - Model 2_100 Estimators | 0.9051 | 0.9306 | 0.9644 | 0.9460 | 0.9253 |
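Before drawing conclusions, the combined table can be ranked by AUC (a small sketch using the dplyr pipeline already loaded):
# Sort the comparison table by AUC, highest first
all_results %>% arrange(desc(AUC)) %>% knitr::kable(caption = "Models Ranked by AUC", row.names = FALSE)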
Among all tested algorithms, the ensemble methods clearly outperformed the single decision tree. By AUC, Random Forest Model 2 (500 trees, 0.9300) and AdaBoost Model 2 with 100 estimators (0.9253) delivered the strongest performance. One caveat: the AdaBoost precision, recall, and F1 values were computed with "no" as the positive class (see the confusion matrix output above), so they are not directly comparable with the decision tree and random forest rows, which treat "yes" as positive. With that caveat in mind, the 500-tree random forest and the tuned AdaBoost model are the best overall choices.