In Machine Learning, experimentation refers to the systematic process of designing, executing, and analyzing different configurations to identify the settings that perform best on a given task. Experimentation is learning by doing: it involves systematically changing parameters, evaluating results with metrics, and comparing different approaches to find the best solution. Essentially, it is the practice of testing and refining machine learning models through controlled experiments to improve their performance.
The key is to modify only one or a few variables at a time to isolate the impact of each change and understand its effect on model performance. In the assignment you will conduct at least 6 experiments. In real life, data scientists run anywhere from a dozen to hundreds of experiments (depending on the dataset and problem domain).
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(highcharter)
library(dplyr)
library(randomForest)
library(adabag)
library(ROCR)
library(pROC)
library(knitr)
library(kableExtra)
In this assignment we will be using the Bank Marketing Dataset.
A Portuguese bank conducted a marketing campaign (phone calls) to predict whether a client will subscribe to a term deposit. The records of their efforts are available in the form of a dataset. Download the Bank Marketing Dataset from: https://archive.ics.uci.edu/dataset/222/bank+marketing
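The data is loaded directly from the course repository as a semicolon-delimited CSV (the full code listing appears at the end of the report); a fixed seed keeps the split and model results reproducible:
set.seed(123)
bank <- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/bank-full.csv", sep = ";")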
str(bank)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
summary(bank)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
desc_table <- data.frame(
Var = c("age", "job", "marital", "education", "default", "balance",
"housing", "loan", "contact", "day", "month", "duration",
"campaign", "pdays", "previous", "poutcome", "y"),
Desc = c("Age of the client",
"Occupation type",
"Marriage status",
"Highest education level of the client",
"Indicates if there is a credit default",
"Yearly average balance in euros",
"Possession of a housing loan",
"Possession of a personal loan",
"Type of communication contact",
"Day of the last contact",
"Month of the last contact",
"Duration of the last contact in seconds",
"Total number of contacts made during this campaign for the client",
"Days elapsed since the client was last contacted in a previous campaign (-1 means no previous contact)",
"Number of contacts before the current campaign for the client",
"Result of the previous marketing campaign",
"Indicates if the client has subscribed to a term deposit")
)
kable(desc_table, align = "ll", caption = "Description of Variables") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, width = "2in") %>%
column_spec(2, width = "5in")
| Var | Desc |
|---|---|
| age | Age of the client |
| job | Occupation type |
| marital | Marriage status |
| education | Highest education level of the client |
| default | Indicates if there is a credit default |
| balance | Yearly average balance in euros |
| housing | Possession of a housing loan |
| loan | Possession of a personal loan |
| contact | Type of communication contact |
| day | Day of the last contact |
| month | Month of the last contact |
| duration | Duration of the last contact in seconds |
| campaign | Total number of contacts made during this campaign for the client |
| pdays | Days elapsed since the client was last contacted in a previous campaign (-1 means no previous contact) |
| previous | Number of contacts before the current campaign for the client |
| poutcome | Result of the previous marketing campaign |
| y | Indicates if the client has subscribed to a term deposit |
In this section, we will prepare the data for further analysis: replace "unknown" values with NA, handle missing values, convert categorical variables to factors, and engineer new features (age_group, balance_group, long_call).
# Replace "unknown" with NA
bank <- bank %>% mutate_all(~ifelse(. == "unknown", NA, .))
# Handle missing values
for (col in names(bank)) {
if (is.factor(bank[[col]])) {
mode_val <- names(sort(table(bank[[col]]), decreasing = TRUE))[1]
bank[[col]][is.na(bank[[col]])] <- mode_val
}
}
# Convert categorical variables to factors
bank <- data.frame(lapply(bank, function(x) if(is.character(x)) factor(x) else x))
# Feature Engineering: Creating age_group
bank$age_group <- cut(bank$age, breaks = c(17, 24, 34, 44, 54, 64, 100),
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+"))
##Create a new feature based on call duration
bank <- bank %>% mutate(long_call = if_else(duration > median(duration, na.rm = TRUE), "yes", "no"))
# Feature Engineering: Creating balance_group (income_group)
bank$balance_group <- ifelse(bank$balance <= 500, "low",
ifelse(bank$balance <= 2000, "medium", "high"))
# Convert new features to factors
bank$age_group <- as.factor(bank$age_group)
bank$balance_group <- as.factor(bank$balance_group)
bank$long_call <- as.factor(bank$long_call)
#Remove remaining rows with any NA values to avoid errors
bank <- na.omit(bank)
Check for any remaining NA values
print(summary(bank))
## age job marital education default
## Min. :18.00 management :1753 divorced: 887 primary :1012 no :7786
## 1st Qu.:32.00 blue-collar:1537 married :4501 secondary:4197 yes: 56
## Median :38.00 technician :1289 single :2454 tertiary :2633
## Mean :40.78 admin. :1057
## 3rd Qu.:47.00 services : 682
## Max. :89.00 retired : 458
## (Other) :1066
## balance housing loan contact day
## Min. :-1884 no :2900 no :6753 cellular :7257 Min. : 1.00
## 1st Qu.: 162 yes:4942 yes:1089 telephone: 585 1st Qu.: 7.00
## Median : 595 Median :14.00
## Mean : 1552 Mean :14.26
## 3rd Qu.: 1734 3rd Qu.:20.00
## Max. :81204 Max. :31.00
##
## month duration campaign pdays
## may :2436 Min. : 5.0 Min. : 1.000 Min. : 1.0
## nov :1093 1st Qu.: 113.0 1st Qu.: 1.000 1st Qu.:133.0
## apr :1075 Median : 194.0 Median : 2.000 Median :195.0
## feb : 881 Mean : 261.3 Mean : 2.064 Mean :223.3
## aug : 493 3rd Qu.: 324.0 3rd Qu.: 2.000 3rd Qu.:326.0
## jan : 472 Max. :2219.0 Max. :16.000 Max. :871.0
## (Other):1392
## previous poutcome y age_group long_call
## Min. : 1.000 failure:4679 no :6056 18-24: 157 no :3652
## 1st Qu.: 1.000 other :1750 yes:1786 25-34:2602 yes:4190
## Median : 2.000 success:1413 35-44:2592
## Mean : 3.184 45-54:1456
## 3rd Qu.: 4.000 55-64: 769
## Max. :275.000 65+ : 266
##
## balance_group
## high :1725
## low :3584
## medium:2533
##
##
##
##
Prepared Data
head(bank,10)
## age job marital education default balance housing loan contact
## 24061 33 admin. married tertiary no 882 no no telephone
## 24063 42 admin. single secondary no -247 yes yes telephone
## 24065 33 services married secondary no 3444 yes no telephone
## 24073 36 management married tertiary no 2415 yes no telephone
## 24078 36 management married tertiary no 0 yes no telephone
## 24087 44 blue-collar married secondary no 1324 yes no telephone
## 24123 26 technician single tertiary no 172 no yes telephone
## 24128 51 admin. single secondary no 3132 no no telephone
## 24152 33 unemployed divorced secondary no 1005 yes no telephone
## 24166 30 admin. married secondary no 873 yes no telephone
## day month duration campaign pdays previous poutcome y age_group
## 24061 21 oct 39 1 151 3 failure no 25-34
## 24063 21 oct 519 1 166 1 other yes 35-44
## 24065 21 oct 144 1 91 4 failure yes 25-34
## 24073 22 oct 73 1 86 4 other no 35-44
## 24078 23 oct 140 1 143 3 failure yes 35-44
## 24087 25 oct 119 1 89 2 other no 35-44
## 24123 4 nov 21 1 140 4 other no 25-34
## 24128 5 nov 449 1 176 1 failure no 45-54
## 24152 10 nov 175 1 174 2 failure no 25-34
## 24166 12 nov 119 1 167 3 success no 25-34
## long_call balance_group
## 24061 no medium
## 24063 yes low
## 24065 no high
## 24073 no high
## 24078 no low
## 24087 no medium
## 24123 no low
## 24128 yes high
## 24152 no medium
## 24166 no medium
Split the data (70% training, 30% testing) and check the distribution of the target variable in both sets.
# Split the data (70% training, 30% testing)
trainIndex <- createDataPartition(bank$y, p = 0.7, list = FALSE)
trainData <- bank[trainIndex, ]
testData <- bank[-trainIndex, ]
# Check the distribution of target variable in both sets
prop.table(table(trainData$y))
##
## no yes
## 0.7721726 0.2278274
prop.table(table(testData$y))
##
## no yes
## 0.7724373 0.2275627
Now that our data is prepared and organized, we are ready to begin the model implementation process.
Within machine learning, decision trees stand out as widely used and easily interpretable algorithms for classification and regression tasks. A basic decision tree model offers essential insight into how the tree is constructed and establishes a baseline that can later be customized to the specific data and goals.
Hypothesis: A simple decision tree with default parameters will provide acceptable performance
dt_model1 <- rpart(y ~ ., data = trainData, method = "class")
rpart.plot(dt_model1, main="Default Decision Tree Model")
The default decision tree model predicts the outcome of the marketing campaign from a handful of variables: the result of the previous campaign ('poutcome'), the month of the last contact, and the duration of that contact (in seconds).
The root node splits the data based on the ‘poutcome’ variable (result of the previous marketing campaign). If ‘poutcome’ is ‘failure’ or ‘other’, the model proceeds down one branch; otherwise, it follows a different path.
Subsequent splits are made based on the ‘month’ and ‘duration’ variables. Each leaf node represents a predicted probability of a positive outcome (represented by the number in the box). For example, if the ‘month’ is in {apr, dec, feb, jan, may, nov} and the ‘duration’ is less than 166 seconds, the model predicts a 54% probability of a positive outcome. The percentages in each leaf node represent the proportion of positive outcomes observed in the training data that fell into that specific leaf node.
The model is relatively simple, using only a few variables and a series of binary splits to make predictions. The percentages at the leaf nodes provide a measure of confidence in the prediction.
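The fitted splits can also be read as text rules; a minimal sketch (assuming the dt_model1 object fitted above), where cover = TRUE reports the share of training observations that reach each rule:
# Print the fitted tree as readable rules; 'cover' adds the % of training rows per rule
rpart.rules(dt_model1, cover = TRUE)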
pred_dt1 <- predict(dt_model1, testData, type = "class")
cm_dt1 <- confusionMatrix(pred_dt1, testData$y)
acc_dt1 <- cm_dt1$overall["Accuracy"]
cat("Decision Tree Experiment 1 (Default): Accuracy =", cm_dt1$overall["Accuracy"], "\n")
## Decision Tree Experiment 1 (Default): Accuracy = 0.8379413
The result shows that our model has a predictive accuracy of 84% against the test data.
# Make predictions
baseline_pred <- predict(dt_model1, testData, type = "class")
# Evaluate
baseline_conf_matrix <- confusionMatrix(baseline_pred, testData$y, positive = "yes")
baseline_conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1669 234
## yes 147 301
##
## Accuracy : 0.8379
## 95% CI : (0.8224, 0.8526)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : 2.176e-15
##
## Kappa : 0.511
##
## Mcnemar's Test P-Value : 1.053e-05
##
## Sensitivity : 0.5626
## Specificity : 0.9191
## Pos Pred Value : 0.6719
## Neg Pred Value : 0.8770
## Prevalence : 0.2276
## Detection Rate : 0.1280
## Detection Prevalence : 0.1906
## Balanced Accuracy : 0.7408
##
## 'Positive' Class : yes
##
# Calculate ROC and AUC
baseline_prob <- predict(dt_model1, testData, type = "prob")[, "yes"]
baseline_roc <- roc(testData$y, baseline_prob)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
baseline_auc <- auc(baseline_roc)
# Plot ROC curve
plot(baseline_roc, main = paste("ROC Curve - Baseline Model (AUC =", round(baseline_auc, 3), ")"), col = "blue")
The default decision tree model shows moderate performance. While the accuracy is reasonably high, the sensitivity is relatively low (56.26%), indicating potential issues in correctly identifying positive cases. The Kappa statistic (0.511) suggests only moderate agreement between the model’s predictions and the actual values. Further analysis and potentially model refinement might be necessary to improve performance, particularly in correctly identifying positive cases.
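One way to address the low sensitivity without refitting the model, sketched below under the assumption that missing a likely subscriber costs more than an extra call, is to lower the classification threshold applied to the predicted probabilities (the 0.3 cutoff is purely illustrative):
# Classify as "yes" when the predicted probability exceeds 0.3 instead of the default 0.5
pred_thresh <- factor(ifelse(baseline_prob > 0.3, "yes", "no"), levels = levels(testData$y))
confusionMatrix(pred_thresh, testData$y, positive = "yes")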
Decision tree models require hyperparameter tuning because it influences how well they generalize to unseen data. A decision tree without proper tuning risks overfitting to noise or underfitting and missing the core patterns in the data. There are various techniques for tuning hyperparameters; for our experiment we will use grid search with cross-validation to find the best model. Tuning manages the bias-variance trade-off: the goal is to find the tree complexity that reduces variance while keeping bias low, which results in stable and accurate predictive models.
Hypothesis: Optimizing complexity parameters will improve model performance
# Define hyperparameter grid
param_grid <- expand.grid(
cp = seq(0.001, 0.02, by = 0.002) # complexity parameter
)
# Set up cross-validation
train_control <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Train model with grid search
tuned_model <- train(
y ~ .,
data = trainData,
method = "rpart",
trControl = train_control,
tuneGrid = param_grid,
metric = "ROC"
)
# Print results
print(tuned_model)
## CART
##
## 5491 samples
## 19 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4393, 4393, 4393, 4393, 4392
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.001 0.8607035 0.9115566 0.5763347
## 0.003 0.8253934 0.9193396 0.5811410
## 0.005 0.8153736 0.9228774 0.5827219
## 0.007 0.7904627 0.9332547 0.5411570
## 0.009 0.7691149 0.9349057 0.5180526
## 0.011 0.7496026 0.9377358 0.5020526
## 0.013 0.7356059 0.9464623 0.4796526
## 0.015 0.7175375 0.9474057 0.4724526
## 0.017 0.7175375 0.9474057 0.4724526
## 0.019 0.7175375 0.9474057 0.4724526
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001.
plot(tuned_model)
The graph helps identify the complexity parameter that yields the decision tree model’s peak predictive performance. The cross-validation results select cp = 0.001 as the best value.
Beyond this point, increasing cp (i.e., forcing a simpler tree) leads to a decrease in the ROC score. This reflects the bias-variance trade-off: too simple a model underfits, while too complex a model overfits the data, and either extreme lowers performance as measured by the ROC score. The optimal complexity parameter is the one that maximizes the cross-validated ROC, indicating the best balance between model complexity and predictive accuracy.
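As a cross-check on the caret grid search, rpart’s built-in cross-validation can be inspected directly; a minimal sketch assuming dt_model1 from the baseline experiment (note that the default fit only explores cp >= 0.01, so a finer grid would require refitting with rpart.control):
# Cross-validated error (xerror) at each cp, then prune at the cp with the minimum xerror
printcp(dt_model1)
best_cp <- dt_model1$cptable[which.min(dt_model1$cptable[, "xerror"]), "CP"]
pruned_model <- prune(dt_model1, cp = best_cp)
rpart.plot(pruned_model, main = "Pruned Decision Tree (min xerror)")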
# Best model
best_model <- tuned_model$finalModel
# Visualize the best tree
rpart.plot(best_model, main = "Optimized Decision Tree")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Make predictions
tuned_pred <- predict(tuned_model, testData, type = "raw")
# Evaluate
tuned_conf_matrix <- confusionMatrix(tuned_pred, testData$y, positive = "yes")
tuned_conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1675 231
## yes 141 304
##
## Accuracy : 0.8418
## 95% CI : (0.8264, 0.8563)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5215
##
## Mcnemar's Test P-Value : 3.942e-06
##
## Sensitivity : 0.5682
## Specificity : 0.9224
## Pos Pred Value : 0.6831
## Neg Pred Value : 0.8788
## Prevalence : 0.2276
## Detection Rate : 0.1293
## Detection Prevalence : 0.1893
## Balanced Accuracy : 0.7453
##
## 'Positive' Class : yes
##
acc_dt2 <- tuned_conf_matrix$overall["Accuracy"]
cat("Decision Tree with Hyperparameter Tuning: Accuracy =", acc_dt2["Accuracy"], "\n")
## Decision Tree with Hyperparameter Tuning: Accuracy = 0.8417695
# Calculate ROC and AUC
tuned_prob <- predict(tuned_model, testData, type = "prob")[, "yes"]
tuned_roc <- roc(testData$y, tuned_prob)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
tuned_auc <- auc(tuned_roc)
# Plot ROC curve
plot(tuned_roc, main = paste("ROC Curve - Tuned Model (AUC =", round(tuned_auc, 3), ")"), col = "green")
The model shows good overall accuracy (84.18%), slightly better than the default model, but with room for improvement in sensitivity (56.82%). The high specificity (92.24%) suggests the model is good at identifying negative cases.
Building an effective decision tree model requires feature selection to ensure optimal performance across multiple applications. Targeting only the most relevant features lets the model improve its predictions and avoid overfitting.
Hypothesis: Selecting the most important features will improve model performance
# Feature importance from baseline model
importance <- dt_model1$variable.importance
top_features <- names(importance)[1:10] # Select top 10 features
top_features
## [1] "poutcome" "duration" "month" "long_call" "pdays" "day"
## [7] "age" "balance" "age_group" "campaign"
# Create new dataset with selected features
train_data_selected <- trainData[, c(top_features, "y")]
test_data_selected <- testData[, c(top_features, "y")]
# Train model with selected features
feature_model <- rpart(y ~ ., data = train_data_selected, method = "class")
# Visualize the tree
rpart.plot(feature_model, main = "Decision Tree with Selected Features")
# Make predictions
feature_pred <- predict(feature_model, test_data_selected, type = "class")
# Evaluate
feature_conf_matrix <- confusionMatrix(feature_pred, test_data_selected$y, positive = "yes")
feature_conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1669 234
## yes 147 301
##
## Accuracy : 0.8379
## 95% CI : (0.8224, 0.8526)
## No Information Rate : 0.7724
## P-Value [Acc > NIR] : 2.176e-15
##
## Kappa : 0.511
##
## Mcnemar's Test P-Value : 1.053e-05
##
## Sensitivity : 0.5626
## Specificity : 0.9191
## Pos Pred Value : 0.6719
## Neg Pred Value : 0.8770
## Prevalence : 0.2276
## Detection Rate : 0.1280
## Detection Prevalence : 0.1906
## Balanced Accuracy : 0.7408
##
## 'Positive' Class : yes
##
acc_dt3 <- feature_conf_matrix$overall["Accuracy"]
cat("Decision Tree Experiment 1 (Default): Accuracy =", acc_dt3["Accuracy"], "\n")
## Decision Tree Experiment 1 (Default): Accuracy = 0.8379413
# Calculate ROC and AUC
feature_prob <- predict(feature_model, test_data_selected, type = "prob")[, "yes"]
feature_roc <- roc(test_data_selected$y, feature_prob)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
feature_auc <- auc(feature_roc)
# Plot ROC curve
plot(feature_roc, main = paste("ROC Curve - Feature Selection Model (AUC =", round(feature_auc, 3), ")"), col = "red")
The model has an accuracy of 83.79%, with a 95% confidence interval of (0.8224, 0.8526). The Kappa statistic is 0.511, indicating moderate agreement. Sensitivity is 56.26% and specificity is 91.91%.
The decision tree model with selected features shows moderate predictive performance, with good specificity but lower sensitivity. The accuracy is reasonably high, but the Kappa statistic suggests that the model’s performance is only moderately better than random chance.
# Compile results from all experiments
results <- data.frame(
Model = c("Baseline", "Feature Selection", "Hyperparameter Tuning"),
Accuracy = c(
baseline_conf_matrix$overall["Accuracy"],
feature_conf_matrix$overall["Accuracy"],
tuned_conf_matrix$overall["Accuracy"]
),
Sensitivity = c(
baseline_conf_matrix$byClass["Sensitivity"],
feature_conf_matrix$byClass["Sensitivity"],
tuned_conf_matrix$byClass["Sensitivity"]
),
Specificity = c(
baseline_conf_matrix$byClass["Specificity"],
feature_conf_matrix$byClass["Specificity"],
tuned_conf_matrix$byClass["Specificity"]
),
F1_Score = c(
baseline_conf_matrix$byClass["F1"],
feature_conf_matrix$byClass["F1"],
tuned_conf_matrix$byClass["F1"]
),
AUC = c(baseline_auc, feature_auc, tuned_auc)
)
# Display results table
print(results)
## Model Accuracy Sensitivity Specificity F1_Score AUC
## 1 Baseline 0.8379413 0.5626168 0.9190529 0.6124110 0.7907036
## 2 Feature Selection 0.8379413 0.5626168 0.9190529 0.6124110 0.7907036
## 3 Hyperparameter Tuning 0.8417695 0.5682243 0.9223568 0.6204082 0.8554757
# Visualize comparison of metrics
metrics_long <- reshape2::melt(results, id.vars = "Model")
ggplot(metrics_long, aes(x = Model, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Performance Comparison Across All Models",
y = "Score",
fill = "Metric") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot ROC curves for all models together
plot(baseline_roc, col = "blue", main = "ROC Curves Comparison")
lines(feature_roc, col = "red")
lines(tuned_roc, col = "green")
legend("bottomright", legend = c("Baseline", "Feature Selection", "Tuned"),
col = c("blue", "red", "green"), lwd = 2)
The experiments focused on enhancing model performance through feature selection and hyperparameter tuning. Hyperparameter tuning produced the best results, with an accuracy that exceeded the baseline and feature selection models by a small margin, and it also achieved the highest AUC (0.8554757). The ROC curve comparison shows that the tuned model maintains higher true positive rates across different false positive rates than its counterparts.
The Hyperparameter Tuning model shows the best performance with an accuracy of 0.8417695 and an AUC of 0.8554757.
The Random Forest algorithm stands as an essential component in the machine learning toolkit. The Random Forest algorithm achieves an optimal mix of accuracy and interpretability while maintaining flexibility which positions it as an excellent tool for research purposes and practical applications. The complexity and growing volume of data will increase the relevance of algorithms such as Random Forests which will drive insights and decision-making in multiple domains.
The default hyperparameter settings deliver accurate predictions while keeping the model easy to interpret, and a classifier’s chance of overfitting declines when the forest contains a sufficient number of trees.
Hypothesis: A baseline Random Forest with default settings will provide acceptable performance
rf_model1 <- randomForest(y ~ ., data = trainData)
pred_rf1 <- predict(rf_model1, testData)
cm_rf1 <- confusionMatrix(pred_rf1, testData$y)
accuracy_rf1 <- cm_rf1$overall["Accuracy"]
cat("Random Forest Experiment 1 (Default): Accuracy =", accuracy_rf1, "\n")
## Random Forest Experiment 1 (Default): Accuracy = 0.85453
plot(rf_model1, main="Random Forest (Default) OOB Error")
varImpPlot(rf_model1, main="Random Forest (Default) Variable Importance")
This initial graph represents how the out-of-bag (OOB) error rate varies based on the number of trees in the random forest. At first, the error rate reduces as the number of trees grows before reaching a stable point which shows that additional trees do not offer substantial accuracy improvement beyond this threshold. The small variations within the plateau region demonstrate the natural randomness present throughout the model’s training process.
The second graph demonstrates variable importance within the random forest model. This plot displays the average reduction in Gini impurity for every variable; higher MeanDecreaseGini values signify greater importance for predicting the outcome. The plot indicates that ‘duration’, ‘poutcome’, ‘month’, ‘pdays’, ‘balance’, and ‘job’ are the most significant variables, although the exact ordering can vary slightly between runs because of the randomness in tree construction.
By default, randomForest() grows 500 decision trees. This number can be adjusted depending on the available computational resources and the complexity of the problem: more trees generally stabilize the error estimate but require more computational power. In the next experiment we set ntree = 200 to see how a smaller forest compares with the default.
Hypothesis: Changing the number of trees (ntree = 200) will affect model performance
rf_model2 <- randomForest(y ~ ., data = trainData, ntree = 200)
pred_rf2 <- predict(rf_model2, testData)
cm_rf2 <- confusionMatrix(pred_rf2, testData$y)
accuracy_rf2 <- cm_rf2$overall["Accuracy"]
cat("Random Forest Experiment 2 (ntree = 200): Accuracy =", accuracy_rf2, "\n")
## Random Forest Experiment 2 (ntree = 200): Accuracy = 0.8507018
plot(rf_model2, main="Random Forest (ntree = 200) OOB Error")
varImpPlot(rf_model2, main="Random Forest (ntree = 200) Variable Importance")
The graph demonstrates a decrease in the OOB error rate with more trees until it reaches a plateau which shows that additional trees beyond this point offer minimal performance benefits. The model demonstrates good performance as indicated by its low and consistent OOB error.
The second graph demonstrates that ‘duration’, ‘poutcome’, ‘month’, ‘pdays’, ‘balance’, and ‘job’ are the most significant variables, the same ranking that we saw in our first experiment.
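The same ranking can be read numerically from the importance matrix; a small sketch assuming rf_model1 from the first experiment:
# Sort variables by mean decrease in Gini impurity and show the top 10
imp <- importance(rf_model1)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)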
# Convert confusion matrices to data frames for ggplot2
cm_df1 <- as.data.frame(cm_rf1$table)
cm_df2 <- as.data.frame(cm_rf2$table)
# Add a model identifier
cm_df1$Model <- "Default RF"
cm_df2$Model <- "RF with 200 trees"
# Combine the data frames
combined_cm_df <- rbind(cm_df1, cm_df2)
# Plotting the confusion matrices
ggplot(data = combined_cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 1) +
facet_wrap(~Model) +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal() +
labs(title = "Confusion Matrix Comparison",
x = "Actual",
y = "Predicted",
fill = "Frequency")
# Create a data frame for comparing accuracies
comparison_df <- data.frame(
Model = c("Default RF", "RF with 200 trees"),
Accuracy = c(accuracy_rf1, accuracy_rf2)
)
print(comparison_df)
## Model Accuracy
## 1 Default RF 0.8545300
## 2 RF with 200 trees 0.8507018
# Visualization: Accuracy Comparison Bar Plot
ggplot(data = comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Accuracy, 4)), vjust = -0.3) +
ylim(0, 1) + # Assuming accuracy is between 0 and 1
theme_minimal() +
labs(title = "Accuracy Comparison",
x = "Model",
y = "Accuracy")
The default random forest model has a slightly higher accuracy (0.8545) compared to the model with 200 trees (0.8507).
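Beyond the number of trees, the mtry parameter (the number of variables tried at each split) often has a larger effect; a hedged sketch, not run in this report, that searches mtry by OOB error on the training data (the ntreeTry, stepFactor, and improve values are illustrative):
# Tune mtry using the out-of-bag error estimate
set.seed(123)
predictors <- setdiff(names(trainData), "y")
tune_res <- tuneRF(x = trainData[, predictors], y = trainData$y,
                   ntreeTry = 200, stepFactor = 1.5, improve = 0.01, trace = FALSE)
tune_res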
In this section we will evaluate the model’s performance and the impact of increasing the number of boosting iterations using the AdaBoost technique. AdaBoost (Adaptive Boosting) is an ensemble learning algorithm that combines multiple weak learners into a strong predictive model with high accuracy.
Hypothesis: A baseline AdaBoost model with default settings (mfinal = 50) will provide acceptable performance on the bank marketing data.
ada_model1 <- boosting(y ~ ., data = trainData, boos = TRUE, mfinal = 50)
pred_ada1 <- predict.boosting(ada_model1, newdata = testData)
cm_ada1 <- confusionMatrix(as.factor(pred_ada1$class), testData$y)
acc_ada1 <- cm_ada1$overall["Accuracy"]
cat("Adaboost Experiment 1 (mfinal = 50): Accuracy =", cm_ada1$overall["Accuracy"], "\n")
## Adaboost Experiment 1 (mfinal = 50): Accuracy = 0.8464483
# Calculate error evolution
evol_ada1 <- errorevol(ada_model1, newdata = testData)
# Plot error evolution with the desired title
plot(evol_ada1$error, type="l", ylim=c(0,max(evol_ada1$error)+0.05), main="AdaBoost (mfinal = 50) Error Evolution", xlab="Iterations", ylab="Error", col = "red")
Our initial experiment ran AdaBoost with its default settings for 50 iterations (mfinal = 50). The accuracy achieved is 0.8464483. The error evolution graph demonstrates a consistent error rate near 0.15 across all 50 iterations.
Experiment 2: AdaBoost with increased number of iterations
AdaBoost’s predictive accuracy often improves with additional iterations, and the algorithm is relatively resistant to overfitting. With more iterations, AdaBoost keeps learning from the errors of its weak classifiers and combines them into a stronger ensemble.
Hypothesis: The model’s accuracy will improve when the number of iterations increases.
ada_model2 <- boosting(y ~ ., data = trainData, boos = TRUE, mfinal = 100)
pred_ada2 <- predict.boosting(ada_model2, newdata = testData)
cm_ada2 <- confusionMatrix(as.factor(pred_ada2$class), testData$y)
acc_ada2 <- cm_ada2$overall["Accuracy"]
cat("Adaboost Experiment 2 (mfinal = 100): Accuracy =", cm_ada2$overall["Accuracy"], "\n")
## Adaboost Experiment 2 (mfinal = 100): Accuracy = 0.8468737
# Calculate error evolution
evol_ada2 <- errorevol(ada_model2, newdata = testData)
# Plot error evolution with the desired title
plot(evol_ada2$error, type="l", ylim=c(0,max(evol_ada2$error)+0.05), main="AdaBoost (mfinal = 100) Error Evolution", xlab="Iterations", ylab="Error", col = "red")
Both graphs show a similar trend: the error rate drops initially and then fluctuates around a steady value. The mfinal = 100 run shows a somewhat lower error rate during the early iterations than the mfinal = 50 run, but the difference is not substantial. The final accuracy for mfinal = 100 is 0.8468737.
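To make the comparison more direct, the two error evolutions can be overlaid on a single plot; a small sketch assuming evol_ada1 and evol_ada2 from above:
# Overlay the test-error curves for mfinal = 50 and mfinal = 100
plot(evol_ada2$error, type = "l", col = "blue", xlab = "Iterations", ylab = "Error",
     ylim = c(0, max(evol_ada2$error) + 0.05),
     main = "AdaBoost Error Evolution: mfinal = 50 vs 100")
lines(evol_ada1$error, col = "red")
legend("topright", legend = c("mfinal = 100", "mfinal = 50"), col = c("blue", "red"), lwd = 2)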
Machine Learning experimentation involves a systematic method for designing and implementing different configurations to assess and identify optimal settings for a given task. This method represents learning by doing since it involves practical application to gain knowledge. The approach requires systematic parameter adjustments followed by outcome measurements with designated metrics and then comparing different methods to establish the best solution. The process entails testing machine learning models in structured experiments while systematically improving them to achieve maximum performance efficiency.
I conducted multiple experiments by applying Decision Tree models, Random Forest algorithms, and Adaptive Boosting methods during this assignment. The Bank Marketing Dataset served as the basis for analysis. The dataset contains records from a phone call marketing campaign that aimed to determine whether clients would choose term deposits.
Decision Tree
Decision trees serve as crucial tools in machine learning and data analysis because they combine clarity with ease of interpretation while needing little data preprocessing. Among machine learning algorithms decision trees stand out as popular models that offer simple interpretation for classification and regression tasks.
Experiment 1: Default Decision Tree
I implemented a basic experiment using a decision tree model with default parameters. This fundamental model serves as a beneficial starting point because its simplicity aids understanding and explanation, which makes it an ideal introduction to more complex machine learning models. A basic decision tree model delivers important knowledge about tree building while forming a core structure that can be adjusted to meet specific data needs and goals. The default model represents the most elementary tree structure and avoids advanced methods like pruning or ensemble techniques.
Hypothesis 1: Basic decision trees using default parameters achieve acceptable performance levels
I expected that a basic decision tree model using standard parameters would deliver satisfactory results. The default decision tree model forecasts the campaign outcome from a small set of variables: the result of the previous campaign (‘poutcome’), the month of the last contact, and the duration of that contact in seconds. Data division at the root node is based on the ‘poutcome’ variable; the model takes one branch when ‘poutcome’ equals ‘failure’ or ‘other’ and follows another branch for all other cases.
The model performs additional splits by examining both ‘month’ and ‘duration’ variables. A leaf node displays the model’s estimated probability of achieving a positive result through the displayed number. When the ‘month’ is part of {apr, dec, feb, jan, may, nov} and the ‘duration’ measures fewer than 166 seconds the model forecasts a 54% chance for a positive outcome. The displayed percentages in each leaf node represent the share of positive training data outcomes that matched the specific leaf node.
The predictive model uses only two input variables and binary splits to operate. The values found in the leaf nodes function as indicators for how confident the model is about each prediction.
Overall, the default decision tree model’s initial results show moderate effectiveness. The model achieves high accuracy despite its poor sensitivity rate of 56.26% which points to difficulties in detecting positive cases. The model’s predictions demonstrate only moderate alignment with actual outcomes as shown by the Kappa statistic of 0.511.
Experiment 2: Decision Tree with Hyperparameter Tuning
I decided to test a Decision Tree Model with Hyperparameter Tuning since the default model only showed moderate performance.
The default model achieves high accuracy but its low sensitivity suggests challenges in detecting positive cases correctly. The Kappa statistic showcases only a moderate level of agreement between predicted and actual results. Hyperparameter tuning is crucial for improving model performance because it helps the model better recognize positive cases. The tuning process aims to achieve the best tree complexity by minimizing variance alongside maintaining low bias to produce more accurate and dependable predictions.
Hypothesis 2: Optimizing complexity parameters will improve model performance
The second model’s hypothesis was that optimizing the complexity parameter would improve performance. In my experiment I used grid search together with cross-validation to determine the best model configuration. According to the cross-validation results, cp = 0.001 gave the best performance, and the ROC score declined as cp increased beyond this value (i.e., as the tree was forced to become simpler). The experiment results demonstrate that decision tree models achieve the best performance at a specific level of complexity: a model that fails to capture the underlying patterns underfits, while one that captures noise instead of patterns overfits, and both degrade performance as measured by the ROC score. The complexity parameter with the maximum cross-validated ROC represents the best compromise between model complexity and predictive performance.
The second model showed good overall accuracy of 84.18%, a modest improvement over the baseline model. The model’s sensitivity still requires improvement, as it lingers at 56.82%, while the 92.24% specificity shows the model is proficient at identifying negative cases.
Experiment 3: Decision Tree with Feature Selection
Hypothesis 3: The selection of essential features leads to better model performance.
Hyperparameter tuning optimizes model performance by adjusting its parameters, whereas feature selection improves the model by removing unnecessary features that might hurt its performance. Combining the two can produce better accuracy and predictive capability, so I expected that selecting the most important features would enhance the existing model’s performance.
The third model failed to deliver better results compared to the tuned model. The model reached an accuracy level of 83.79% and displayed a 95% confidence interval from 0.8224 to 0.8526. The Kappa statistic demonstrates a moderate agreement value of 0.511 with sensitivity and specificity measured at 56.26% and 91.91% respectively.
The focus of my experiments was to elevate the performance of a simple decision tree model by employing feature selection techniques along with hyperparameter tuning strategies. Hyperparameter tuning produced superior results which slightly outperformed both the baseline model and the feature selection approach. The model optimized through hyperparameter tuning reached the superior AUC score of 0.8554757. This model demonstrates superior performance through higher true positive rates at different false positive rates when compared to alternative models as shown by the ROC curve.
Random forest
Machine learning practitioners must include the Random Forest algorithm in their essential toolset. Random Forest combines high accuracy and interpretability with versatile capabilities making it a powerful asset for both research and practical use cases. The increasing complexity and data volume will magnify the importance of algorithms such as Random Forests which will be instrumental in generating insights and guiding decisions across various fields.
Experiment 1: Default Random Forest
The default random forest model was my choice for the initial experiment. The default hyperparameter settings provide accurate predictions alongside easy interpretability of their functionality. The risk of overfitting for a classifier declines as the forest includes enough trees.
Hypothesis 1: A Random Forest classifier operating with default settings is expected to achieve reliable performance levels.
As the number of trees grows the out-of-bag (OOB) error rate declines until it stabilizes at approximately 0.1 according to the graph’s results. The trend shows that performance improves with the addition of trees but experiences diminishing returns after a specific threshold. On the test dataset the model achieves an accuracy rate of approximately 85.45%.
The results affirm the hypothesis that the default configuration of a baseline Random Forest model produces satisfactory outcomes because of its relatively high accuracy.
The test data evaluation shows that the Random Forest model achieves an accuracy rate of 85.45%. According to the OOB error graph, more trees improve model performance, but the enhancement diminishes past a certain limit.
Experiment 2: Random Forest with a different number of trees (ntree = 200)
The baseline Random Forest model grows 500 decision trees by default. The number of trees can be modified to match the available computational resources and the complexity of the problem: more trees generally stabilize the error estimate but demand more computation. In the second experiment I set ntree = 200 to examine how a smaller forest compares.
Hypothesis: Changing the number of trees (ntree = 200) will affect model performance.
The second model uses a Random Forest with 200 trees. The out-of-bag error starts high but declines rapidly as trees are added, then levels off after roughly 20 trees, indicating that further increases produce minimal improvement. The final OOB error of about 0.1 indicates a low error rate and strong predictive accuracy, and the model achieves an accuracy score of 0.8507018, correctly predicting outcomes in about 85% of cases.
After training with 200 trees the model achieved good performance with an accuracy close to 85% and maintained a low OOB error rate of about 0.1. After reaching 20 trees the error rate stops decreasing which indicates further tree additions offer limited advantages. The variable importance plot identifies the main features that influence the model’s predictive outcomes.
Unfortunately, model performance did not improve with ntree = 200; the default forest remained slightly more accurate.
Adaptive boosting
AdaBoost functions as a machine learning algorithm which combines multiple weak learners through ensemble learning to build a strong predictive model that yields high accuracy.
Experiment 1: Default AdaBoost using the adabag package
Hypothesis 1: Using default AdaBoost will provide acceptable performance.
The experiment uses AdaBoost in its default configuration with 50 iterations (mfinal set to 50). The accuracy achieved is 0.8464483. The error evolution graph displays a consistent error rate of approximately 0.15 during all 50 iterations.
Experiment 2: AdaBoost with increased number of iterations
The predictive accuracy of AdaBoost models can improve as the number of iterations increases, and the algorithm is relatively resistant to overfitting. By adding more iterations, AdaBoost keeps correcting the errors made by weak classifiers, combining them into an effective strong classifier.
Hypothesis 2: Model accuracy improves when the number of iterations increases.
For the second experiment I decided to increase the boosting iteration count to 100 by setting mfinal to 100. The modification led to a small accuracy improvement that reached a score of 0.8468737.
The model demonstrated a slight improvement in accuracy and error rate when the boosting iterations were raised from 50 to 100. The reduction in error remains small since the performance improvement is not significant. The model’s effectiveness appears to reach a maximum after a specific number of iterations and additional increases beyond this point likely produce minimal improvements. The ideal iteration count for this dataset and model should be determined through further analysis.
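One way to choose that count more systematically, sketched here as a follow-up rather than something run in this report, is adabag’s cross-validated boosting:
# v-fold cross-validation of AdaBoost on the training data; cv_ada$error is the CV error estimate
set.seed(123)
cv_ada <- boosting.cv(y ~ ., data = trainData, v = 5, boos = TRUE, mfinal = 100)
cv_ada$error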
Summary
#Create summary table
results_comprehensive <- data.frame(
Model= c("Decision Tree (Default)", "Decision Tree with tuning", "Decision Tree with feature selection",
"Random_Forest (Default)", "Random_Forest (ntree = 200)",
"Adaptive_Boosting (mfinal = 50)", "Adaptive_Boosting (mfinal = 100)"),
Accuracy = c(baseline_conf_matrix$overall["Accuracy"],
tuned_conf_matrix$overall["Accuracy"],
feature_conf_matrix$overall["Accuracy"],
cm_rf1$overall["Accuracy"],
cm_rf2$overall["Accuracy"],
cm_ada1$overall["Accuracy"],
cm_ada2$overall["Accuracy"]),
Sensitivity = c(baseline_conf_matrix$byClass["Sensitivity"],
tuned_conf_matrix$byClass["Sensitivity"],
feature_conf_matrix$byClass["Sensitivity"],
cm_rf1$byClass["Sensitivity"],
cm_rf2$byClass["Sensitivity"],
cm_ada1$byClass["Sensitivity"],
cm_ada2$byClass["Sensitivity"]),
Specificity = c(baseline_conf_matrix$byClass["Specificity"],
tuned_conf_matrix$byClass["Specificity"],
feature_conf_matrix$byClass["Specificity"],
cm_rf1$byClass["Specificity"],
cm_rf2$byClass["Specificity"],
cm_ada1$byClass["Specificity"],
cm_ada2$byClass["Specificity"])
)
kable(results_comprehensive, "html", caption = "Comprehensive Model Performance Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
column_spec(2:4, color = "white", background = "SteelBlue")
| Model | Accuracy | Sensitivity | Specificity |
|---|---|---|---|
| Decision Tree (Default) | 0.8379413 | 0.5626168 | 0.9190529 |
| Decision Tree with tuning | 0.8417695 | 0.5682243 | 0.9223568 |
| Decision Tree with feature selection | 0.8379413 | 0.5626168 | 0.9190529 |
| Random_Forest (Default) | 0.8545300 | 0.9207048 | 0.6299065 |
| Random_Forest (ntree = 200) | 0.8507018 | 0.9196035 | 0.6168224 |
| Adaptive_Boosting (mfinal = 50) | 0.8464483 | 0.9190529 | 0.6000000 |
| Adaptive_Boosting (mfinal = 100) | 0.8468737 | 0.9218062 | 0.5925234 |
Note that the Random Forest and AdaBoost confusion matrices were computed with caret’s default positive class (“no”), so their Sensitivity and Specificity columns are effectively swapped relative to the decision tree rows (which set positive = “yes”). Read on a consistent basis, with “yes” as the positive class, the default Random Forest model offers the best accuracy (0.8545) and the highest detection rate of subscribers (about 63%), the AdaBoost models follow at roughly 59-60%, and the decision tree variants identify about 56-57% of subscribers; all models keep the recall of non-subscribers above 90%.
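For a like-for-like check, the ensemble confusion matrices can be recomputed with “yes” as the positive class; a minimal sketch assuming pred_rf1 and testData from above:
# Recompute the Random Forest confusion matrix with the subscriber class ("yes") as positive
confusionMatrix(pred_rf1, testData$y, positive = "yes")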
Further work could aim to raise the detection rate of positive cases while maintaining overall accuracy, for example through additional hyperparameter optimization, class weights, or a lower classification threshold that penalizes false negatives. In business scenarios where false positives (unnecessary calls) are expensive, a more conservative threshold or a tuned decision tree may be preferable.
The selection of the best model for this business problem therefore depends primarily on the relative costs of false positive and false negative outcomes. I think the Random Forest model should be chosen when the goal is to maximize both accuracy and the detection of likely subscribers, even at the expense of some false positives; when minimizing false positives is the priority, a tuned decision tree or an adjusted Random Forest classification threshold could be the better choice.
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(highcharter)
library(dplyr)
library(randomForest)
library(adabag)
library(ROCR)
library(pROC)
library(knitr)
library(kableExtra)
set.seed(123)
bank<- read.csv("https://raw.githubusercontent.com/uplotnik/DATA-622/refs/heads/main/bank-full.csv",sep=";")
str(bank)
summary(bank)
desc_table <- data.frame(
Var = c("age", "job", "marital", "education", "default", "balance",
"housing", "loan", "contact", "day", "month", "duration",
"campaign", "pdays", "previous", "poutcome", "y"),
Desc = c("Age of the client",
"Occupation type",
"Marriage status",
"Highest education level of the client",
"Indicates if there is a credit default",
"Yearly average balance in euros",
"Possession of a housing loan",
"Possession of a personal loan",
"Type of communication contact",
"Day of the last contact",
"Month of the last contact",
"Duration of the last contact in seconds",
"Total number of contacts made during this campaign for the client",
"Days elapsed since the client was last contacted in a previous campaign (-1 means no previous contact)",
"Number of contacts before the current campaign for the client",
"Result of the previous marketing campaign",
"Indicates if the client has subscribed to a term deposit")
)
kable(desc_table, align = "ll", caption = "Description of Variables") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, width = "2in") %>%
column_spec(2, width = "5in")
# Replace "unknown" with NA
bank <- bank %>% mutate_all(~ifelse(. == "unknown", NA, .))
# Handle missing values
for (col in names(bank)) {
if (is.factor(bank[[col]])) {
mode_val <- names(sort(table(bank[[col]]), decreasing = TRUE))[1]
bank[[col]][is.na(bank[[col]])] <- mode_val
}
}
# Convert categorical variables to factors
bank <- data.frame(lapply(bank, function(x) if(is.character(x)) factor(x) else x))
# Feature Engineering: Creating age_group
bank$age_group <- cut(bank$age, breaks = c(17, 24, 34, 44, 54, 64, 100),
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+"))
##Create a new feature based on call duration
bank <- bank %>% mutate(long_call = if_else(duration > median(duration, na.rm = TRUE), "yes", "no"))
# Feature Engineering: Creating balance_group (income_group)
bank$balance_group <- ifelse(bank$balance <= 500, "low",
ifelse(bank$balance <= 2000, "medium", "high"))
# Convert new features to factors
bank$age_group <- as.factor(bank$age_group)
bank$balance_group <- as.factor(bank$balance_group)
bank$long_call <- as.factor(bank$long_call)
#Remove remaining rows with any NA values to avoid errors
bank <- na.omit(bank)
print(summary(bank))
head(bank,10)
# Split the data (70% training, 30% testing)
trainIndex <- createDataPartition(bank$y, p = 0.7, list = FALSE)
trainData <- bank[trainIndex, ]
testData <- bank[-trainIndex, ]
# Check the distribution of target variable in both sets
prop.table(table(trainData$y))
prop.table(table(testData$y))
dt_model1 <- rpart(y ~ ., data = trainData, method = "class")
rpart.plot(dt_model1, main="Default Decision Tree Model")
pred_dt1 <- predict(dt_model1, testData, type = "class")
cm_dt1 <- confusionMatrix(pred_dt1, testData$y)
acc_dt1 <- cm_dt1$overall["Accuracy"]
cat("Decision Tree Experiment 1 (Default): Accuracy =", cm_dt1$overall["Accuracy"], "\n")
# Make predictions
baseline_pred <- predict(dt_model1, testData, type = "class")
# Evaluate
baseline_conf_matrix <- confusionMatrix(baseline_pred, testData$y, positive = "yes")
baseline_conf_matrix
# Calculate ROC and AUC
baseline_prob <- predict(dt_model1, testData, type = "prob")[, "yes"]
baseline_roc <- roc(testData$y, baseline_prob)
baseline_auc <- auc(baseline_roc)
# Plot ROC curve
plot(baseline_roc, main = paste("ROC Curve - Baseline Model (AUC =", round(baseline_auc, 3), ")"), col = "blue")
# Define hyperparameter grid
param_grid <- expand.grid(
cp = seq(0.001, 0.02, by = 0.002) # complexity parameter
)
# Set up cross-validation
train_control <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary
)
# Train model with grid search
tuned_model <- train(
y ~ .,
data = trainData,
method = "rpart",
trControl = train_control,
tuneGrid = param_grid,
metric = "ROC"
)
# Print results
print(tuned_model)
plot(tuned_model)
# Best model
best_model <- tuned_model$finalModel
# Visualize the best tree
rpart.plot(best_model, main = "Optimized Decision Tree")
# Make predictions
tuned_pred <- predict(tuned_model, testData, type = "raw")
# Evaluate
tuned_conf_matrix <- confusionMatrix(tuned_pred, testData$y, positive = "yes")
tuned_conf_matrix
acc_dt2 <- tuned_conf_matrix$overall["Accuracy"]
cat("Decision Tree with Hyperparameter Tuning: Accuracy =", acc_dt2["Accuracy"], "\n")
# Calculate ROC and AUC
tuned_prob <- predict(tuned_model, testData, type = "prob")[, "yes"]
tuned_roc <- roc(testData$y, tuned_prob)
tuned_auc <- auc(tuned_roc)
# Plot ROC curve
plot(tuned_roc, main = paste("ROC Curve - Tuned Model (AUC =", round(tuned_auc, 3), ")"), col = "green")
# Feature importance from baseline model
importance <- dt_model1$variable.importance
top_features <- names(importance)[1:10] # Select top 10 features
top_features
# Create new dataset with selected features
train_data_selected <- trainData[, c(top_features, "y")]
test_data_selected <- testData[, c(top_features, "y")]
# Train model with selected features
feature_model <- rpart(y ~ ., data = train_data_selected, method = "class")
# Visualize the tree
rpart.plot(feature_model, main = "Decision Tree with Selected Features")
# Make predictions
feature_pred <- predict(feature_model, test_data_selected, type = "class")
# Evaluate
feature_conf_matrix <- confusionMatrix(feature_pred, test_data_selected$y, positive = "yes")
feature_conf_matrix
acc_dt3 <- feature_conf_matrix$overall["Accuracy"]
cat("Decision Tree Experiment 1 (Default): Accuracy =", acc_dt3["Accuracy"], "\n")
# Calculate ROC and AUC
feature_prob <- predict(feature_model, test_data_selected, type = "prob")[, "yes"]
feature_roc <- roc(test_data_selected$y, feature_prob)
feature_auc <- auc(feature_roc)
# Plot ROC curve
plot(feature_roc, main = paste("ROC Curve - Feature Selection Model (AUC =", round(feature_auc, 3), ")"), col = "red")
# Compile results from all experiments
results <- data.frame(
Model = c("Baseline", "Feature Selection", "Hyperparameter Tuning"),
Accuracy = c(
baseline_conf_matrix$overall["Accuracy"],
feature_conf_matrix$overall["Accuracy"],
tuned_conf_matrix$overall["Accuracy"]
),
Sensitivity = c(
baseline_conf_matrix$byClass["Sensitivity"],
feature_conf_matrix$byClass["Sensitivity"],
tuned_conf_matrix$byClass["Sensitivity"]
),
Specificity = c(
baseline_conf_matrix$byClass["Specificity"],
feature_conf_matrix$byClass["Specificity"],
tuned_conf_matrix$byClass["Specificity"]
),
F1_Score = c(
baseline_conf_matrix$byClass["F1"],
feature_conf_matrix$byClass["F1"],
tuned_conf_matrix$byClass["F1"]
),
AUC = c(baseline_auc, feature_auc, tuned_auc)
)
# Display results table
print(results)
# Visualize comparison of metrics
metrics_long <- reshape2::melt(results, id.vars = "Model")
ggplot(metrics_long, aes(x = Model, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Performance Comparison Across All Models",
y = "Score",
fill = "Metric") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot ROC curves for all models together
plot(baseline_roc, col = "blue", main = "ROC Curves Comparison")
lines(feature_roc, col = "red")
lines(tuned_roc, col = "green")
legend("bottomright", legend = c("Baseline", "Feature Selection", "Tuned"),
col = c("blue", "red", "green"), lwd = 2)
rf_model1 <- randomForest(y ~ ., data = trainData)
pred_rf1 <- predict(rf_model1, testData)
cm_rf1 <- confusionMatrix(pred_rf1, testData$y)
accuracy_rf1 <- cm_rf1$overall["Accuracy"]
cat("Random Forest Experiment 1 (Default): Accuracy =", accuracy_rf1, "\n")
plot(rf_model1, main="Random Forest (Default) OOB Error")
varImpPlot(rf_model1, main="Random Forest (Default) Variable Importance")
rf_model2 <- randomForest(y ~ ., data = trainData, ntree = 200)
pred_rf2 <- predict(rf_model2, testData)
cm_rf2 <- confusionMatrix(pred_rf2, testData$y)
accuracy_rf2 <- cm_rf2$overall["Accuracy"]
cat("Random Forest Experiment 2 (ntree = 200): Accuracy =", accuracy_rf2, "\n")
plot(rf_model2, main="Random Forest (ntree = 200) OOB Error")
varImpPlot(rf_model2, main="Random Forest (ntree = 200) Variable Importance")
# Convert confusion matrices to data frames for ggplot2
cm_df1 <- as.data.frame(cm_rf1$table)
cm_df2 <- as.data.frame(cm_rf2$table)
# Add a model identifier
cm_df1$Model <- "Default RF"
cm_df2$Model <- "RF with 200 trees"
# Combine the data frames
combined_cm_df <- rbind(cm_df1, cm_df2)
# Plotting the confusion matrices
ggplot(data = combined_cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 1) +
facet_wrap(~Model) +
scale_fill_gradient(low = "white", high = "steelblue") +
theme_minimal() +
labs(title = "Confusion Matrix Comparison",
x = "Actual",
y = "Predicted",
fill = "Frequency")
# Create a data frame for comparing accuracies
comparison_df <- data.frame(
Model = c("Default RF", "RF with 200 trees"),
Accuracy = c(accuracy_rf1, accuracy_rf2)
)
print(comparison_df)
# Visualization: Accuracy Comparison Bar Plot
ggplot(data = comparison_df, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Accuracy, 4)), vjust = -0.3) +
ylim(0, 1) + # Assuming accuracy is between 0 and 1
theme_minimal() +
labs(title = "Accuracy Comparison",
x = "Model",
y = "Accuracy")
ada_model1 <- boosting(y ~ ., data = trainData, boos = TRUE, mfinal = 50)
pred_ada1 <- predict.boosting(ada_model1, newdata = testData)
cm_ada1 <- confusionMatrix(as.factor(pred_ada1$class), testData$y)
acc_ada1 <- cm_ada1$overall["Accuracy"]
cat("Adaboost Experiment 1 (mfinal = 50): Accuracy =", cm_ada1$overall["Accuracy"], "\n")
# Calculate error evolution
evol_ada1 <- errorevol(ada_model1, newdata = testData)
# Plot error evolution with the desired title
plot(evol_ada1$error, type="l", ylim=c(0,max(evol_ada1$error)+0.05), main="AdaBoost (mfinal = 50) Error Evolution", xlab="Iterations", ylab="Error", col = "red")
ada_model2 <- boosting(y ~ ., data = trainData, boos = TRUE, mfinal = 100)
pred_ada2 <- predict.boosting(ada_model2, newdata = testData)
cm_ada2 <- confusionMatrix(as.factor(pred_ada2$class), testData$y)
acc_ada2 <- cm_ada2$overall["Accuracy"]
cat("Adaboost Experiment 2 (mfinal = 100): Accuracy =", cm_ada2$overall["Accuracy"], "\n")
# Calculate error evolution
evol_ada2 <- errorevol(ada_model2, newdata = testData)
# Plot error evolution with the desired title
plot(evol_ada2$error, type="l", ylim=c(0,max(evol_ada2$error)+0.05), main="AdaBoost (mfinal = 100) Error Evolution", xlab="Iterations", ylab="Error", col = "red")
#Create summary table
results_comprehensive <- data.frame(
Model= c("Decision Tree (Default)", "Decision Tree with tuning", "Decision Tree with feature selection",
"Random_Forest (Default)", "Random_Forest (ntree = 200)",
"Adaptive_Boosting (mfinal = 50)", "Adaptive_Boosting (mfinal = 100)"),
Accuracy = c(baseline_conf_matrix$overall["Accuracy"],
tuned_conf_matrix$overall["Accuracy"],
feature_conf_matrix$overall["Accuracy"],
cm_rf1$overall["Accuracy"],
cm_rf2$overall["Accuracy"],
cm_ada1$overall["Accuracy"],
cm_ada2$overall["Accuracy"]),
Sensitivity = c(baseline_conf_matrix$byClass["Sensitivity"],
tuned_conf_matrix$byClass["Sensitivity"],
feature_conf_matrix$byClass["Sensitivity"],
cm_rf1$byClass["Sensitivity"],
cm_rf2$byClass["Sensitivity"],
cm_ada1$byClass["Sensitivity"],
cm_ada2$byClass["Sensitivity"]),
Specificity = c(baseline_conf_matrix$byClass["Specificity"],
tuned_conf_matrix$byClass["Specificity"],
feature_conf_matrix$byClass["Specificity"],
cm_rf1$byClass["Specificity"],
cm_rf2$byClass["Specificity"],
cm_ada1$byClass["Specificity"],
cm_ada2$byClass["Specificity"])
)
kable(results_comprehensive, "html", caption = "Comprehensive Model Performance Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
column_spec(2:4, color = "white", background = "SteelBlue")