ML_2

# Loading the libraries I will use
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(dplyr)
library(corrplot)

## corrplot 0.95 loaded

install.packages("DataExplorer", repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages

library(DataExplorer)

# Reading the semicolon-separated bank-full.csv file and previewing its first rows
data <- read.csv(
  file = "/Users/ursulapodosenin/Desktop/bank-full.csv",
  sep = ";"
)
head(data)

##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no

# Loading the necessary packages

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

install.packages("rpart",repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages

library(rpart)
install.packages("randomForest",repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

install.packages("adabag",repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages

library(adabag)

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loading required package: doParallel

## Loading required package: iterators

## Loading required package: parallel

install.packages("pROC",repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# The response column y must be a factor for the classifiers used below
data[["y"]] <- as.factor(data[["y"]])

# 70/30 train-test partition on y; the seed is fixed so the split is repeatable
set.seed(865)
trainIndex <- createDataPartition(y = data[["y"]], p = 0.7, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]

# Defining the evaluation function
#
# Scores a fitted classifier on held-out data.
#
# Arguments:
#   model    - fitted model whose predict() method accepts type = "class"
#              (called in this file with rpart and randomForest fits).
#   testData - data frame with the predictors and the factor target `y`.
#   positive - optional level of `y` forwarded to confusionMatrix() as the
#              positive class. NULL (the default) keeps caret's own default;
#              NOTE(review): caret documents that default as the FIRST factor
#              level, so precision/recall/F1 then describe that level.
#
# Returns a list with accuracy, precision, recall, F1 and AUC.
evaluate_model <- function(model, testData, positive = NULL) {
  truth <- testData$y
  # The second factor level is treated as the "case", matching the original
  # 0/1 coding (as.numeric(y) - 1).
  case_level <- levels(truth)[2]

  predictions <- predict(model, testData, type = "class")
  cm <- confusionMatrix(predictions, truth, positive = positive)

  # AUC needs a continuous score. Hard 0/1 labels give a single-threshold ROC
  # curve and a value that cannot be compared with probability-based AUCs.
  # Models whose predict() cannot return probabilities fall back to the hard
  # labels, with a warning so the weaker estimate is visible.
  scores <- tryCatch(
    predict(model, testData, type = "prob")[, case_level],
    error = function(e) {
      warning(
        "Class probabilities unavailable (", conditionMessage(e),
        "); AUC computed from hard class labels instead.",
        call. = FALSE
      )
      as.numeric(predictions) - 1
    }
  )

  # Levels and direction are fixed explicitly so pROC never flips the
  # comparison to inflate the AUC.
  auc <- roc(
    response = truth,
    predictor = scores,
    levels = levels(truth),
    direction = "<"
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}

# Experiment 1: rpart classification tree, no control settings overridden
dt_model1 <- rpart(
  formula = y ~ .,
  data = trainData,
  method = "class"
)
dt_results1 <- evaluate_model(model = dt_model1, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Experiment 2: rpart classification tree with the depth capped at 5
dt_model2 <- rpart(
  formula = y ~ .,
  data = trainData,
  method = "class",
  control = rpart.control(maxdepth = 5)
)
dt_results2 <- evaluate_model(model = dt_model2, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Experiment 3: Random Forest with ntree = 100 (no other argument is set)
rf_model1 <- randomForest(
  formula = y ~ .,
  data = trainData,
  ntree = 100
)
rf_results1 <- evaluate_model(model = rf_model1, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Experiment 4: Random Forest with the tree count doubled to ntree = 200
rf_model2 <- randomForest(
  formula = y ~ .,
  data = trainData,
  ntree = 200
)
rf_results2 <- evaluate_model(model = rf_model2, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Scores a fitted adabag boosting model on held-out data.
#
# Arguments:
#   model    - fitted model whose predict() result is a list with `class`
#              (predicted labels) and `prob` (class-probability matrix).
#   testData - data frame with the predictors and the factor target `y`.
#   positive - optional level of `y` forwarded to confusionMatrix() as the
#              positive class; NULL (the default) keeps caret's own default.
#
# Returns a list with accuracy, precision, recall, F1 and AUC.
evaluate_model_adaboost <- function(model, testData, positive = NULL) {
  truth <- testData$y
  predictions <- predict(model, newdata = testData)

  # The predicted labels are re-levelled so they share the level set of y
  # before comparison.
  pred_classes <- factor(predictions$class, levels = levels(truth))
  cm <- confusionMatrix(pred_classes, truth, positive = positive)

  # AUC needs a continuous score, so the class probability is used instead of
  # the hard 0/1 label.
  # NOTE(review): assumes the columns of `prob` follow the level order of y,
  # so column 2 is the second level — confirm against the adabag docs.
  case_scores <- predictions$prob[, 2]

  # Levels and direction are fixed explicitly so pROC never flips the
  # comparison to inflate the AUC.
  auc <- roc(
    response = truth,
    predictor = case_scores,
    levels = levels(truth),
    direction = "<"
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}

# Rebuild y as a factor in both sets, with the test set reusing the training levels
trainData[["y"]] <- factor(trainData[["y"]])
testData[["y"]] <- factor(
  testData[["y"]],
  levels = levels(trainData[["y"]])
)

# Experiment 5: Adaboost with 50 boosting iterations (mfinal = 50)
ab_model1 <- boosting(
  formula = y ~ .,
  data = trainData,
  boos = TRUE,
  mfinal = 50
)
ab_results1 <- evaluate_model_adaboost(model = ab_model1, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Experiment 6: Adaboost with more iterations
# Same factor/level alignment of y as was applied before Experiment 5.
trainData[["y"]] <- factor(trainData[["y"]])
testData[["y"]] <- factor(
  testData[["y"]],
  levels = levels(trainData[["y"]])
)

# Scores a fitted adabag boosting model on held-out data.
# This assignment replaces the function of the same name defined earlier in
# the file, so it is the version used from here on.
#
# Arguments:
#   model    - fitted model whose predict() result is a list with `class`
#              (predicted labels) and `prob` (class-probability matrix).
#   testData - data frame with the predictors and the factor target `y`.
#   positive - optional level of `y` forwarded to confusionMatrix() as the
#              positive class; NULL (the default) keeps caret's own default.
#
# Returns a list with accuracy, precision, recall, F1 and AUC.
evaluate_model_adaboost <- function(model, testData, positive = NULL) {
  truth <- testData$y
  predictions <- predict(model, newdata = testData)

  # Force the predicted labels onto the same level set as y.
  pred_classes <- factor(predictions$class, levels = levels(truth))
  cm <- confusionMatrix(pred_classes, truth, positive = positive)

  # AUC needs a continuous score, so the class probability is used instead of
  # the hard 0/1 label.
  # NOTE(review): assumes the columns of `prob` follow the level order of y,
  # so column 2 is the second level — confirm against the adabag docs.
  case_scores <- predictions$prob[, 2]

  # Levels and direction are fixed explicitly so pROC never flips the
  # comparison to inflate the AUC.
  auc <- roc(
    response = truth,
    predictor = case_scores,
    levels = levels(truth),
    direction = "<"
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}

# Keeping the factor levels of y identical across the train and test sets
trainData[["y"]] <- factor(trainData[["y"]])
testData[["y"]] <- factor(
  testData[["y"]],
  levels = levels(trainData[["y"]])
)

# Adaboost again, this time with 100 boosting iterations (mfinal = 100)
ab_model2 <- boosting(
  formula = y ~ .,
  data = trainData,
  boos = TRUE,
  mfinal = 100
)
ab_results2 <- evaluate_model_adaboost(model = ab_model2, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Gathering the six experiments into one comparison table, one row per model
results <- local({
  runs <- list(
    dt_results1, dt_results2,
    rf_results1, rf_results2,
    ab_results1, ab_results2
  )

  # Pull one metric out of every run as a plain numeric vector
  metric <- function(field) {
    vapply(runs, function(run) as.numeric(run[[field]]), numeric(1))
  }

  data.frame(
    Model = c(
      "Decision Tree (Default)",
      "Decision Tree (Max Depth=5)",
      "Random Forest (100 Trees)",
      "Random Forest (200 Trees)",
      "Adaboost (50 Iterations)",
      "Adaboost (100 Iterations)"
    ),
    Accuracy = metric("accuracy"),
    Precision = metric("precision"),
    Recall = metric("recall"),
    F1_Score = metric("F1"),
    AUC = metric("AUC")
  )
})

results

##                         Model  Accuracy Precision    Recall  F1_Score       AUC
## 1     Decision Tree (Default) 0.9031854 0.9204321 0.9746159 0.9467494 0.6692121
## 2 Decision Tree (Max Depth=5) 0.9031854 0.9204321 0.9746159 0.9467494 0.6692121
## 3   Random Forest (100 Trees) 0.9056924 0.9264811 0.9701904 0.9478321 0.6944268
## 4   Random Forest (200 Trees) 0.9076832 0.9273191 0.9716099 0.9489480 0.6982892
## 5    Adaboost (50 Iterations) 0.9051762 0.9316750 0.9632599 0.9472042 0.7149212
## 6   Adaboost (100 Iterations) 0.9044389 0.9309232 0.9632599 0.9468155 0.7117686

# Installing the necessary packages
install.packages("e1071", repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//Rtmp2tdUhh/downloaded_packages

library(e1071)

# Making sure y is a factor with the same levels in the train and test sets
trainData[["y"]] <- factor(trainData[["y"]])
testData[["y"]] <- factor(
  testData[["y"]],
  levels = levels(trainData[["y"]])
)

# Experiment 7: radial-kernel SVM, fitted with probability estimates enabled
svm_model <- svm(
  formula = y ~ .,
  data = trainData,
  kernel = "radial",
  probability = TRUE
)

# Evaluate SVM
#
# Scores a fitted e1071 SVM (trained with probability = TRUE) on held-out data.
#
# Arguments:
#   model    - fitted SVM whose predict(..., probability = TRUE) result
#              carries a "probabilities" attribute.
#   testData - data frame with the predictors and the factor target `y`.
#   positive - optional level of `y` forwarded to confusionMatrix() as the
#              positive class; NULL (the default) keeps caret's own default.
#
# Returns a list with accuracy, precision, recall, F1 and AUC.
evaluate_model_svm <- function(model, testData, positive = NULL) {
  truth <- testData$y
  # The second factor level is treated as the "case", matching the original
  # 0/1 coding (as.numeric(y) - 1).
  case_level <- levels(truth)[2]

  predictions <- predict(model, testData, probability = TRUE)

  # The probability column is selected by class NAME rather than by position.
  # NOTE(review): e1071 labels these columns with the class names and their
  # order may follow the order classes appear in the training data rather
  # than the factor-level order — confirm against the predict.svm docs.
  probs <- attr(predictions, "probabilities")[, case_level]

  # Convert to factor for confusion matrix
  pred_classes <- factor(predictions, levels = levels(truth))
  cm <- confusionMatrix(pred_classes, truth, positive = positive)

  # Levels and direction are fixed explicitly so pROC never flips the
  # comparison to inflate the AUC.
  auc <- roc(
    response = truth,
    predictor = probs,
    levels = levels(truth),
    direction = "<"
  )$auc

  list(
    accuracy = cm$overall["Accuracy"],
    precision = cm$byClass["Precision"],
    recall = cm$byClass["Recall"],
    F1 = cm$byClass["F1"],
    AUC = auc
  )
}

# Scoring the radial-kernel SVM on the held-out test set
svm_results <- evaluate_model_svm(model = svm_model, testData = testData)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Adding the SVM results to the final comparison table
# The metrics come back as named length-one vectors (e.g. named "Accuracy").
# A one-row data.frame would adopt that name as its row name, so the SVM row
# would be labelled "Accuracy" instead of continuing the 1..n numbering.
# The names (and the extra attributes on the AUC object) are stripped first.
results <- rbind(
  results,
  data.frame(
    Model = "SVM (Radial Kernel)",
    Accuracy = unname(svm_results$accuracy),
    Precision = unname(svm_results$precision),
    Recall = unname(svm_results$recall),
    F1_Score = unname(svm_results$F1),
    AUC = as.numeric(svm_results$AUC)
  )
)

results

##                                Model  Accuracy Precision    Recall  F1_Score
## 1            Decision Tree (Default) 0.9031854 0.9204321 0.9746159 0.9467494
## 2        Decision Tree (Max Depth=5) 0.9031854 0.9204321 0.9746159 0.9467494
## 3          Random Forest (100 Trees) 0.9056924 0.9264811 0.9701904 0.9478321
## 4          Random Forest (200 Trees) 0.9076832 0.9273191 0.9716099 0.9489480
## 5           Adaboost (50 Iterations) 0.9051762 0.9316750 0.9632599 0.9472042
## 6          Adaboost (100 Iterations) 0.9044389 0.9309232 0.9632599 0.9468155
## Accuracy         SVM (Radial Kernel) 0.8980976 0.9126032 0.9782899 0.9443056
##                AUC
## 1        0.6692121
## 2        0.6692121
## 3        0.6944268
## 4        0.6982892
## 5        0.7149212
## 6        0.7117686
## Accuracy 0.9027437

ML_2

Ursula Podosenin

2025-03-23