library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import Data

# Load the dataset
data <- read_excel("bank_latih_clean.xlsx")
head(data)
## # A tibble: 6 × 17
##     Age job        marital education default balance housing loan  contact   day
##   <dbl> <chr>      <chr>   <chr>     <chr>     <dbl> <chr>   <chr> <chr>   <dbl>
## 1    30 unemployed married primary   no         1787 no      no    cellul…    19
## 2    33 services   married secondary no         4789 yes     yes   cellul…    11
## 3    35 management single  tertiary  no         1350 yes     no    cellul…    16
## 4    30 management married tertiary  no         1476 yes     yes   cellul…     3
## 5    59 blue-coll… married secondary no            0 yes     no    cellul…     5
## 6    35 management single  tertiary  no          747 no      no    cellul…    23
## # ℹ 7 more variables: month <chr>, duration <dbl>, campaign <dbl>, pdays <dbl>,
## #   previous <dbl>, poutcome <chr>, y <chr>
str(data)
## tibble [4,521 × 17] (S3: tbl_df/tbl/data.frame)
##  $ Age      : num [1:4521] 30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr [1:4521] "unemployed" "services" "management" "management" ...
##  $ marital  : chr [1:4521] "married" "married" "single" "married" ...
##  $ education: chr [1:4521] "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr [1:4521] "no" "no" "no" "no" ...
##  $ balance  : num [1:4521] 1787 4789 1350 1476 0 ...
##  $ housing  : chr [1:4521] "no" "yes" "yes" "yes" ...
##  $ loan     : chr [1:4521] "no" "yes" "no" "yes" ...
##  $ contact  : chr [1:4521] "cellular" "cellular" "cellular" "cellular" ...
##  $ day      : num [1:4521] 19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr [1:4521] "oct" "may" "apr" "jun" ...
##  $ duration : num [1:4521] 79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : num [1:4521] 1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : num [1:4521] -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : num [1:4521] 0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr [1:4521] "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr [1:4521] "no" "no" "no" "no" ...

Data Preparation and Data Split

# Prepare the data for the classification methods (Naive Bayes, decision
# tree, k-NN, random forest, ANN): convert character and logical columns
# to factors and make sure all numeric columns are stored as numeric

data <- data %>%
  mutate_if(is.character, as.factor) %>%
  mutate_if(is.logical, as.factor) %>%
  mutate_if(is.integer, as.numeric) %>%
  mutate_if(is.double, as.numeric)
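
The mutate_if() helpers still work but are superseded since dplyr 1.0; an equivalent across()/where() formulation (a sketch with the same intended behavior) would be:

# Equivalent dplyr 1.0+ style; readxl imports numbers as doubles rather
# than integers, so the integer/double coercions above should be no-ops
data <- data %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(across(where(is.logical), as.factor))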

library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)

set.seed(123)
# Split the data into training and testing sets
trainIndex <- createDataPartition(data$y, p = .8, 
                                  list = FALSE, 
                                  times = 1)
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]
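
Since createDataPartition() samples within each level of y, the roughly 88/12 no/yes split should be preserved in both partitions; a quick sanity check (not part of the original analysis):

# Class proportions should be nearly identical in both partitions
prop.table(table(train_data$y))
prop.table(table(test_data$y))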

Naive Bayes Model

Testing Data

# Train the Naive Bayes model
model_nb <- naiveBayes(y ~ ., data = train_data)

# Predicting on the test set
predictions_nb <- predict(model_nb, newdata = test_data)

# Evaluating the model
confusion_nb <- confusionMatrix(predictions_nb, test_data$y)
confusion_nb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  745  54
##        yes  55  50
##                                           
##                Accuracy : 0.8794          
##                  95% CI : (0.8564, 0.8999)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.7198          
##                                           
##                   Kappa : 0.4103          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.9313          
##             Specificity : 0.4808          
##          Pos Pred Value : 0.9324          
##          Neg Pred Value : 0.4762          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8241          
##    Detection Prevalence : 0.8838          
##       Balanced Accuracy : 0.7060          
##                                           
##        'Positive' Class : no              
## 
# Compute precision, recall, and F1-score
precision_nb <- confusion_nb$byClass['Precision']
recall_nb <- confusion_nb$byClass['Recall']
f1_score_nb <- 2 * ((precision_nb * recall_nb) / (precision_nb + recall_nb))

# Extracting accuracy
accuracy_nb <- confusion_nb$overall['Accuracy']

# Collect precision, recall, F1-score, and accuracy in a results table
results_nb <- data.frame(
  Model = "Naive Bayes",
  Precision = precision_nb,
  Recall = recall_nb,
  F1_Score = f1_score_nb,
  Accuracy = accuracy_nb
)
results_nb
##                 Model Precision  Recall  F1_Score  Accuracy
## Precision Naive Bayes 0.9324155 0.93125 0.9318324 0.8794248

Training Data

# Evaluate the Naive Bayes model on the training data
train_pred_nb <- predict(model_nb, newdata = train_data)

# Evaluating the model
confusion_train_nb <- confusionMatrix(train_pred_nb, train_data$y)
confusion_train_nb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2943  205
##        yes  257  212
##                                         
##                Accuracy : 0.8723        
##                  95% CI : (0.861, 0.883)
##     No Information Rate : 0.8847        
##     P-Value [Acc > NIR] : 0.99035       
##                                         
##                   Kappa : 0.4061        
##                                         
##  Mcnemar's Test P-Value : 0.01766       
##                                         
##             Sensitivity : 0.9197        
##             Specificity : 0.5084        
##          Pos Pred Value : 0.9349        
##          Neg Pred Value : 0.4520        
##              Prevalence : 0.8847        
##          Detection Rate : 0.8137        
##    Detection Prevalence : 0.8703        
##       Balanced Accuracy : 0.7140        
##                                         
##        'Positive' Class : no            
## 
# Compute precision, recall, and F1-score
precision_train_nb <- confusion_train_nb$byClass['Precision']
recall_train_nb <- confusion_train_nb$byClass['Recall']
f1_score_train_nb <- 2 * ((precision_train_nb * recall_train_nb) / (precision_train_nb + recall_train_nb))

# Extracting accuracy
accuracy_train_nb <- confusion_train_nb$overall['Accuracy']

# Collect precision, recall, F1-score, and accuracy in a results table
results_train_nb <- data.frame(
  Model = "Naive Bayes Train",
  Precision = precision_train_nb,
  Recall = recall_train_nb,
  F1_Score = f1_score_train_nb,
  Accuracy = accuracy_train_nb
)
results_train_nb
##                       Model Precision    Recall  F1_Score  Accuracy
## Precision Naive Bayes Train 0.9348793 0.9196875 0.9272212 0.8722698

Decision Tree Model

Testing Data

library(rpart)
library(caret)

# Train the decision tree model
model_dt <- rpart(y ~ ., data = train_data, method = "class")

# Predicting on the test set
predictions_dt <- predict(model_dt, newdata = test_data, type = "class")

# Evaluating the model
confusion_dt <- confusionMatrix(predictions_dt, test_data$y)
confusion_dt
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  775  63
##        yes  25  41
##                                           
##                Accuracy : 0.9027          
##                  95% CI : (0.8814, 0.9212)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.05058         
##                                           
##                   Kappa : 0.4316          
##                                           
##  Mcnemar's Test P-Value : 8.006e-05       
##                                           
##             Sensitivity : 0.9688          
##             Specificity : 0.3942          
##          Pos Pred Value : 0.9248          
##          Neg Pred Value : 0.6212          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8573          
##    Detection Prevalence : 0.9270          
##       Balanced Accuracy : 0.6815          
##                                           
##        'Positive' Class : no              
## 
# Precision, Recall, F1
precision_dt <- confusion_dt$byClass['Precision']
recall_dt <- confusion_dt$byClass['Recall']
f1_score_dt <- 2 * ((precision_dt * recall_dt) / (precision_dt + recall_dt))

# Accuracy
accuracy_dt <- confusion_dt$overall['Accuracy']

# Build the results table
results_dt <- data.frame(
  Model = "Decision Tree",
  Precision = precision_dt,
  Recall = recall_dt,
  F1_Score = f1_score_dt,
  Accuracy = accuracy_dt
)

print(results_dt)
##                   Model Precision  Recall  F1_Score  Accuracy
## Precision Decision Tree  0.924821 0.96875 0.9462759 0.9026549
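
The fitted splits can also be drawn; the sketch below assumes the rpart.plot package is installed (it is not loaded anywhere above):

# Optional visualization -- rpart.plot is an extra dependency
# install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(model_dt)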

Training Data

library(rpart)
library(caret)

# Predict on the training data
train_pred <- predict(model_dt, newdata = train_data, type = "class")

# Confusion matrix
confusion_train <- confusionMatrix(train_pred, train_data$y)
confusion_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3117  223
##        yes   83  194
##                                           
##                Accuracy : 0.9154          
##                  95% CI : (0.9059, 0.9243)
##     No Information Rate : 0.8847          
##     P-Value [Acc > NIR] : 1.005e-09       
##                                           
##                   Kappa : 0.5144          
##                                           
##  Mcnemar's Test P-Value : 1.925e-15       
##                                           
##             Sensitivity : 0.9741          
##             Specificity : 0.4652          
##          Pos Pred Value : 0.9332          
##          Neg Pred Value : 0.7004          
##              Prevalence : 0.8847          
##          Detection Rate : 0.8618          
##    Detection Prevalence : 0.9234          
##       Balanced Accuracy : 0.7196          
##                                           
##        'Positive' Class : no              
## 
# Precision, Recall, F1-Score
precision_train <- confusion_train$byClass['Precision']
recall_train <- confusion_train$byClass['Recall']
f1_score_train <- 2 * ((precision_train * recall_train) / (precision_train + recall_train))

# Accuracy
accuracy_train <- confusion_train$overall['Accuracy']

# Build the results table
results_train <- data.frame(
  Model = "Decision Tree Train",
  Precision = precision_train,
  Recall = recall_train,
  F1_Score = f1_score_train,
  Accuracy = accuracy_train
)

print(results_train)
##                         Model Precision    Recall F1_Score  Accuracy
## Precision Decision Tree Train 0.9332335 0.9740625 0.953211 0.9153995
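
The training accuracy (0.9154) is only slightly above the testing accuracy (0.9027), so overfitting looks mild, but rpart's complexity table gives a more principled check; a hypothetical pruning step based on the cross-validated error:

# Cross-validated error (xerror) for each complexity parameter value
printcp(model_dt)

# Keep the subtree whose cp minimizes the cross-validated error
best_cp <- model_dt$cptable[which.min(model_dt$cptable[, "xerror"]), "CP"]
model_dt_pruned <- prune(model_dt, cp = best_cp)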

Random Forest Model

Testing Data

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Train the random forest model with 100 trees
model_rf <- randomForest(y ~ ., data = train_data, ntree = 100)

# Predicting on the test set
predictions_rf <- predict(model_rf, newdata = test_data)

# Evaluating the model
confusion_rf <- confusionMatrix(predictions_rf, test_data$y)
confusion_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  773  61
##        yes  27  43
##                                           
##                Accuracy : 0.9027          
##                  95% CI : (0.8814, 0.9212)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.0505847       
##                                           
##                   Kappa : 0.4427          
##                                           
##  Mcnemar's Test P-Value : 0.0004351       
##                                           
##             Sensitivity : 0.9663          
##             Specificity : 0.4135          
##          Pos Pred Value : 0.9269          
##          Neg Pred Value : 0.6143          
##              Prevalence : 0.8850          
##          Detection Rate : 0.8551          
##    Detection Prevalence : 0.9226          
##       Balanced Accuracy : 0.6899          
##                                           
##        'Positive' Class : no              
## 
# Compute precision, recall, and F1-score
precision_rf <- confusion_rf$byClass['Precision']
recall_rf <- confusion_rf$byClass['Recall']
f1_score_rf <- 2 * ((precision_rf * recall_rf) / (precision_rf + recall_rf))

# Extracting accuracy
accuracy_rf <- confusion_rf$overall['Accuracy']

# Collect precision, recall, F1-score, and accuracy in a results table
results_rf <- data.frame(
  Model = "Random Forest",
  Precision = precision_rf,
  Recall = recall_rf,
  F1_Score = f1_score_rf,
  Accuracy = accuracy_rf
)
results_rf
##                   Model Precision  Recall  F1_Score  Accuracy
## Precision Random Forest 0.9268585 0.96625 0.9461444 0.9026549
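
A common follow-up is to ask which predictors drive the forest; the randomForest package provides importance scores out of the box:

# Mean decrease in Gini impurity per predictor
importance(model_rf)
varImpPlot(model_rf)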

Training Data

# Evaluate the random forest on the training data

# Predict on the training set
train_pred_rf <- predict(model_rf, newdata = train_data)

# Evaluation metrics
confusion_train_rf <- confusionMatrix(train_pred_rf, train_data$y)
confusion_train_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3200    2
##        yes    0  415
##                                          
##                Accuracy : 0.9994         
##                  95% CI : (0.998, 0.9999)
##     No Information Rate : 0.8847         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9973         
##                                          
##  Mcnemar's Test P-Value : 0.4795         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.9952         
##          Pos Pred Value : 0.9994         
##          Neg Pred Value : 1.0000         
##              Prevalence : 0.8847         
##          Detection Rate : 0.8847         
##    Detection Prevalence : 0.8853         
##       Balanced Accuracy : 0.9976         
##                                          
##        'Positive' Class : no             
## 
# Compute precision, recall, and F1-score
precision_train_rf <- confusion_train_rf$byClass['Precision']
recall_train_rf <- confusion_train_rf$byClass['Recall']
f1_score_train_rf <- 2 * ((precision_train_rf * recall_train_rf) / (precision_train_rf + recall_train_rf))

# Extracting accuracy
accuracy_train_rf <- confusion_train_rf$overall['Accuracy']

# Collect precision, recall, F1-score, and accuracy in a results table
results_train_rf <- data.frame(
  Model = "Random Forest Train",
  Precision = precision_train_rf,
  Recall = recall_train_rf,
  F1_Score = f1_score_train_rf,
  Accuracy = accuracy_train_rf
)

results_train_rf
##                         Model Precision Recall  F1_Score  Accuracy
## Precision Random Forest Train 0.9993754      1 0.9996876 0.9994471
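
The near-perfect training accuracy above is expected: a random forest scored on its own training data largely memorizes it, so it says little about generalization. The out-of-bag (OOB) error reported by the fitted object is a less optimistic in-training estimate:

# OOB error is computed on trees that did not see each observation
print(model_rf)

Finally, a hypothetical wrap-up step (not in the original) stacks the per-model testing results for a side-by-side comparison:

# Combine the test-set results tables built above
comparison <- rbind(results_nb, results_dt, results_rf)
print(comparison, row.names = FALSE)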