# Loading the libraries I will use
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(corrplot)
## corrplot 0.95 loaded
install.packages("DataExplorer", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages
library(DataExplorer)
# Read the semicolon-delimited bank-full.csv export into `data` and preview
# its first six rows.
data <- read.csv(
  file = "/Users/ursulapodosenin/Desktop/bank-full.csv",
  header = TRUE,
  sep = ";"
)
head(data, n = 6L)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no
# Loading the necessary packages
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
install.packages("rpart",repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages
library(rpart)
install.packages("randomForest",repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
install.packages("adabag",repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
install.packages("pROC",repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Store the outcome column `y` as a factor.
data[["y"]] <- as.factor(data[["y"]])

# Hold-out split: caret picks the rows for the 70% training partition and
# every remaining row goes to the test partition. The fixed seed makes the
# partition reproducible.
set.seed(865)
trainIndex <- createDataPartition(y = data[["y"]], p = 0.7, list = FALSE)
trainData <- data[trainIndex, , drop = FALSE]
testData <- data[-trainIndex, , drop = FALSE]
# Evaluate a fitted classifier on a hold-out set.
#
# Used in this script for rpart and randomForest fits.
#
# Arguments:
#   model    - fitted model whose predict() method accepts type = "class"
#              (factor of predicted labels) and type = "prob" (matrix with
#              one column per class, named after the class levels).
#   testData - data frame with the predictors and a two-level factor column
#              `y` holding the true labels.
#   positive - optional label forwarded to confusionMatrix() as the positive
#              class. The default NULL keeps caret's own default (the first
#              factor level), so precision/recall/F1 describe that level
#              unless a label is supplied.
#
# Returns a list with elements accuracy, precision, recall, F1 and AUC.
evaluate_model <- function(model, testData, positive = NULL) {
  truth <- testData$y
  class_levels <- levels(truth)

  predictions <- predict(model, testData, type = "class")
  cm <- confusionMatrix(predictions, truth, positive = positive)

  # AUC is a ranking metric, so score it on the predicted probability of the
  # second level. Feeding hard 0/1 labels to roc() yields a ROC curve with a
  # single threshold and is not comparable with probability-based AUCs.
  class_probs <- predict(model, testData, type = "prob")
  case_prob <- class_probs[, class_levels[2]]

  # Explicit levels/direction: first level = control, second level = case,
  # higher score = more likely case. Nothing is auto-detected.
  auc <- roc(
    response = truth,
    predictor = case_prob,
    levels = class_levels,
    direction = "<",
    quiet = TRUE
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}
# Experiment 1: classification tree with no rpart control settings supplied
dt_model1 <- rpart(
  formula = y ~ .,
  data = trainData,
  method = "class"
)
dt_results1 <- evaluate_model(model = dt_model1, testData = testData)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Experiment 2: same tree specification with maxdepth = 5 passed through
# rpart.control()
dt_model2 <- rpart(
  formula = y ~ .,
  data = trainData,
  method = "class",
  control = rpart.control(maxdepth = 5)
)
dt_results2 <- evaluate_model(model = dt_model2, testData = testData)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Experiment 3: random forest with 100 trees (only ntree is set explicitly)
rf_model1 <- randomForest(
  y ~ .,
  data = trainData,
  ntree = 100
)
rf_results1 <- evaluate_model(model = rf_model1, testData = testData)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Experiment 4: random forest with the tree count doubled to 200
rf_model2 <- randomForest(
  y ~ .,
  data = trainData,
  ntree = 200
)
rf_results2 <- evaluate_model(model = rf_model2, testData = testData)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Evaluate a fitted adabag boosting model on a hold-out set.
#
# Arguments:
#   model    - object returned by boosting().
#   testData - data frame with the predictors and a two-level factor column
#              `y` holding the true labels.
#
# Returns a list with elements accuracy, precision, recall, F1 and AUC.
evaluate_model_adaboost <- function(model, testData) {
  truth <- testData$y
  class_levels <- levels(truth)

  predictions <- predict(model, newdata = testData)

  # Rebuild the predicted labels as a factor on the reference levels so that
  # confusionMatrix() compares two factors with identical level sets.
  pred_classes <- factor(predictions$class, levels = class_levels)
  cm <- confusionMatrix(pred_classes, truth)

  # Score the AUC on the ensemble's class probabilities instead of the hard
  # labels; hard labels give a ROC curve with a single threshold.
  # NOTE(review): `prob` has no column names; column 2 is assumed to be the
  # second level of `y` - confirm against adabag's predict.boosting docs.
  stopifnot(ncol(predictions$prob) == length(class_levels))
  case_prob <- predictions$prob[, 2]

  # Explicit levels/direction: first level = control, second level = case,
  # higher score = more likely case. Nothing is auto-detected.
  auc <- roc(
    response = truth,
    predictor = case_prob,
    levels = class_levels,
    direction = "<",
    quiet = TRUE
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}

# Ensure y is a factor with the same levels in the train and test sets.
# Doing this once is enough; nothing between the two experiments changes y.
trainData$y <- factor(trainData$y)
testData$y <- factor(testData$y, levels = levels(trainData$y))

# Experiment 5: AdaBoost with 50 boosting iterations
ab_model1 <- boosting(y ~ ., data = trainData, boos = TRUE, mfinal = 50)
ab_results1 <- evaluate_model_adaboost(ab_model1, testData)

# Experiment 6: AdaBoost with 100 boosting iterations
ab_model2 <- boosting(y ~ ., data = trainData, boos = TRUE, mfinal = 100)
ab_results2 <- evaluate_model_adaboost(ab_model2, testData)
# Collect the six experiments into one comparison table: one row per model,
# one column per metric.
results <- local({
  runs <- list(
    "Decision Tree (Default)" = dt_results1,
    "Decision Tree (Max Depth=5)" = dt_results2,
    "Random Forest (100 Trees)" = rf_results1,
    "Random Forest (200 Trees)" = rf_results2,
    "Adaboost (50 Iterations)" = ab_results1,
    "Adaboost (100 Iterations)" = ab_results2
  )

  # Pull one metric out of every run as a plain, unnamed numeric vector.
  metric_column <- function(field) {
    vapply(
      runs,
      function(run) as.numeric(run[[field]]),
      numeric(1),
      USE.NAMES = FALSE
    )
  }

  data.frame(
    Model = names(runs),
    Accuracy = metric_column("accuracy"),
    Precision = metric_column("precision"),
    Recall = metric_column("recall"),
    F1_Score = metric_column("F1"),
    AUC = metric_column("AUC")
  )
})
results
## Model Accuracy Precision Recall F1_Score AUC
## 1 Decision Tree (Default) 0.9031854 0.9204321 0.9746159 0.9467494 0.6692121
## 2 Decision Tree (Max Depth=5) 0.9031854 0.9204321 0.9746159 0.9467494 0.6692121
## 3 Random Forest (100 Trees) 0.9056924 0.9264811 0.9701904 0.9478321 0.6944268
## 4 Random Forest (200 Trees) 0.9076832 0.9273191 0.9716099 0.9489480 0.6982892
## 5 Adaboost (50 Iterations) 0.9051762 0.9316750 0.9632599 0.9472042 0.7149212
## 6 Adaboost (100 Iterations) 0.9044389 0.9309232 0.9632599 0.9468155 0.7117686
# Installing the necessary packages
install.packages("e1071", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages
library(e1071)
# Rebuild the outcome factor in both partitions so the test set uses exactly
# the training set's levels.
trainData[["y"]] <- factor(trainData[["y"]])
testData[["y"]] <- factor(testData[["y"]], levels = levels(trainData[["y"]]))
# Experiment 7: SVM with radial kernel, fitted with probability = TRUE so
# class probabilities can be requested at prediction time
svm_model <- svm(
  y ~ .,
  data = trainData,
  kernel = "radial",
  probability = TRUE
)
# Evaluate a fitted e1071 SVM on a hold-out set.
#
# Arguments:
#   model    - object returned by svm(), fitted with probability = TRUE.
#   testData - data frame with the predictors and a two-level factor column
#              `y` holding the true labels.
#
# Returns a list with elements accuracy, precision, recall, F1 and AUC.
# Stops with an error if the prediction carries no class probabilities.
evaluate_model_svm <- function(model, testData) {
  truth <- testData$y
  class_levels <- levels(truth)

  predictions <- predict(model, testData, probability = TRUE)
  prob_matrix <- attr(predictions, "probabilities")
  if (is.null(prob_matrix)) {
    stop(
      "predict() returned no \"probabilities\" attribute; ",
      "fit the model with probability = TRUE",
      call. = FALSE
    )
  }

  # Pick the second level's column by name. A positional [, 2] silently
  # selects the wrong class whenever the matrix columns are not in factor
  # level order.
  case_prob <- prob_matrix[, class_levels[2]]

  # Convert to factor on the reference levels for the confusion matrix
  pred_classes <- factor(predictions, levels = class_levels)
  cm <- confusionMatrix(pred_classes, truth)

  # Explicit levels/direction: first level = control, second level = case,
  # higher score = more likely case. Nothing is auto-detected.
  auc <- roc(
    response = truth,
    predictor = case_prob,
    levels = class_levels,
    direction = "<",
    quiet = TRUE
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}
# Score the SVM on the hold-out set
svm_results <- evaluate_model_svm(svm_model, testData)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Adding the SVM results to the final comparison table.
# Each metric is a length-one named vector (e.g. c(Accuracy = ...)). Without
# row.names = NULL, data.frame() adopts that name as the row label and the
# new row is printed as "Accuracy" instead of continuing the 1..n numbering.
results <- rbind(
  results,
  data.frame(
    Model = "SVM (Radial Kernel)",
    Accuracy = svm_results$accuracy,
    Precision = svm_results$precision,
    Recall = svm_results$recall,
    F1_Score = svm_results$F1,
    # Store the AUC as a plain number, like the rows already in `results`
    AUC = as.numeric(svm_results$AUC),
    row.names = NULL
  )
)
results
## Model Accuracy Precision Recall F1_Score
## 1 Decision Tree (Default) 0.9031854 0.9204321 0.9746159 0.9467494
## 2 Decision Tree (Max Depth=5) 0.9031854 0.9204321 0.9746159 0.9467494
## 3 Random Forest (100 Trees) 0.9056924 0.9264811 0.9701904 0.9478321
## 4 Random Forest (200 Trees) 0.9076832 0.9273191 0.9716099 0.9489480
## 5 Adaboost (50 Iterations) 0.9051762 0.9316750 0.9632599 0.9472042
## 6 Adaboost (100 Iterations) 0.9044389 0.9309232 0.9632599 0.9468155
## 7 SVM (Radial Kernel) 0.8980976 0.9126032 0.9782899 0.9443056
## AUC
## 1 0.6692121
## 2 0.6692121
## 3 0.6944268
## 4 0.6982892
## 5 0.7149212
## 6 0.7117686
## 7 0.9027437