library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Import Data
# Load the dataset
data <- read_excel("bank_latih_clean.xlsx")
head(data)
## # A tibble: 6 × 17
##     Age job        marital education default balance housing loan  contact   day
##   <dbl> <chr>      <chr>   <chr>     <chr>     <dbl> <chr>   <chr> <chr>   <dbl>
## 1    30 unemployed married primary   no         1787 no      no    cellul…    19
## 2    33 services   married secondary no         4789 yes     yes   cellul…    11
## 3    35 management single  tertiary  no         1350 yes     no    cellul…    16
## 4    30 management married tertiary  no         1476 yes     yes   cellul…     3
## 5    59 blue-coll… married secondary no            0 yes     no    cellul…     5
## 6    35 management single  tertiary  no          747 no      no    cellul…    23
## # ℹ 7 more variables: month <chr>, duration <dbl>, campaign <dbl>, pdays <dbl>,
## #   previous <dbl>, poutcome <chr>, y <chr>
str(data)
## tibble [4,521 × 17] (S3: tbl_df/tbl/data.frame)
##  $ Age      : num [1:4521] 30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr [1:4521] "unemployed" "services" "management" "management" ...
##  $ marital  : chr [1:4521] "married" "married" "single" "married" ...
##  $ education: chr [1:4521] "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr [1:4521] "no" "no" "no" "no" ...
##  $ balance  : num [1:4521] 1787 4789 1350 1476 0 ...
##  $ housing  : chr [1:4521] "no" "yes" "yes" "yes" ...
##  $ loan     : chr [1:4521] "no" "yes" "no" "yes" ...
##  $ contact  : chr [1:4521] "cellular" "cellular" "cellular" "cellular" ...
##  $ day      : num [1:4521] 19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr [1:4521] "oct" "may" "apr" "jun" ...
##  $ duration : num [1:4521] 79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : num [1:4521] 1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : num [1:4521] -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : num [1:4521] 0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr [1:4521] "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr [1:4521] "no" "no" "no" "no" ...
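Before converting types and splitting, it is worth confirming that the cleaned file really contains no missing values; a minimal check:
# Count NA values per column; all zeros are expected for the cleaned file
colSums(is.na(data))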
Data Preparation and Data Split
# Prepare the data for the classification methods (Naive Bayes, decision
# tree, kNN, random forest, ANN): categorical columns become factors and all
# numeric columns are kept as doubles
data <- data %>%
  mutate_if(is.character, as.factor) %>%
  mutate_if(is.logical, as.factor) %>%
  mutate_if(is.integer, as.numeric)
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
set.seed(123)
# Split the data into training and testing sets
trainIndex <- createDataPartition(data$y, p = .8,
                                  list = FALSE,
                                  times = 1)
train_data <- data[trainIndex, ]
test_data <- data[-trainIndex, ]
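Because createDataPartition() samples within each level of y, both splits should preserve the full data's class ratio (roughly 88% "no" to 12% "yes"); a quick check:
# Class proportions should be nearly identical across the two splits
prop.table(table(train_data$y))
prop.table(table(test_data$y))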
Naive Bayes Model
Test Data
# Fit the Naive Bayes model
model_nb <- naiveBayes(y ~ ., data = train_data)
# Predicting on the test set
predictions_nb <- predict(model_nb, newdata = test_data)
# Evaluating the model
confusion_nb <- confusionMatrix(predictions_nb, test_data$y)
confusion_nb
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  no yes
##        no  745  54
##        yes  55  50
##
## Accuracy : 0.8794
## 95% CI : (0.8564, 0.8999)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.7198
##
## Kappa : 0.4103
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.9313
## Specificity : 0.4808
## Pos Pred Value : 0.9324
## Neg Pred Value : 0.4762
## Prevalence : 0.8850
## Detection Rate : 0.8241
## Detection Prevalence : 0.8838
## Balanced Accuracy : 0.7060
##
## 'Positive' Class : no
##
# Compute precision, recall, and F1-score
precision_nb <- confusion_nb$byClass['Precision']
recall_nb <- confusion_nb$byClass['Recall']
f1_score_nb <- 2 * ((precision_nb * recall_nb) / (precision_nb + recall_nb))
# Extracting accuracy
accuracy_nb <- confusion_nb$overall['Accuracy']
# Collect precision, recall, F1-score, and accuracy in one table
results_nb <- data.frame(
Model = "Naive Bayes",
Precision = precision_nb,
Recall = recall_nb,
F1_Score = f1_score_nb,
Accuracy = accuracy_nb
)
results_nb
##                 Model Precision  Recall  F1_Score  Accuracy
## Precision Naive Bayes 0.9324155 0.93125 0.9318324 0.8794248
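Note that caret treats the majority class "no" as the 'Positive' class by default, so these scores describe performance on non-subscribers. The byClass vector already contains an F1 entry, and the minority class can be scored by setting the positive argument; a minimal sketch (the object name confusion_nb_yes is illustrative):
# caret's byClass already includes F1, so the manual formula can be verified
confusion_nb$byClass['F1']
# Re-evaluate with "yes" as the positive class to score the minority class
confusion_nb_yes <- confusionMatrix(predictions_nb, test_data$y, positive = "yes")
confusion_nb_yes$byClass[c('Precision', 'Recall', 'F1')]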
Training Data
# Naive Bayes predictions on the training data
train_pred_nb <- predict(model_nb, newdata = train_data)
# Evaluating the model
confusion_train_nb <- confusionMatrix(train_pred_nb, train_data$y)
confusion_train_nb
## Confusion Matrix and Statistics
##
##           Reference
## Prediction   no  yes
##        no  2943  205
##        yes  257  212
##
## Accuracy : 0.8723
## 95% CI : (0.861, 0.883)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : 0.99035
##
## Kappa : 0.4061
##
## Mcnemar's Test P-Value : 0.01766
##
## Sensitivity : 0.9197
## Specificity : 0.5084
## Pos Pred Value : 0.9349
## Neg Pred Value : 0.4520
## Prevalence : 0.8847
## Detection Rate : 0.8137
## Detection Prevalence : 0.8703
## Balanced Accuracy : 0.7140
##
## 'Positive' Class : no
##
# Compute precision, recall, and F1-score
precision_train_nb <- confusion_train_nb$byClass['Precision']
recall_train_nb <- confusion_train_nb$byClass['Recall']
f1_score_train_nb <- 2 * ((precision_train_nb * recall_train_nb) / (precision_train_nb + recall_train_nb))
# Extracting accuracy
accuracy_train_nb <- confusion_train_nb$overall['Accuracy']
# Collect precision, recall, F1-score, and accuracy in one table
results_train_nb <- data.frame(
Model = "Naive Bayes Train",
Precision = precision_train_nb,
Recall = recall_train_nb,
F1_Score = f1_score_train_nb,
Accuracy = accuracy_train_nb
)
results_train_nb
##                       Model Precision    Recall  F1_Score  Accuracy
## Precision Naive Bayes Train 0.9348793 0.9196875 0.9272212 0.8722698
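Training metrics tend to be optimistic, so it helps to view both splits side by side; the two tables built above can simply be row-bound:
# Naive Bayes: training vs. test performance in one table
rbind(results_train_nb, results_nb)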
Decision Tree Model
Test Data
library(rpart)
library(caret)
# Fit the decision tree model
model_dt <- rpart(y ~ ., data = train_data, method = "class")
# Predicting on the test set
predictions_dt <- predict(model_dt, newdata = test_data, type = "class")
# Evaluating the model
confusion_dt <- confusionMatrix(predictions_dt, test_data$y)
confusion_dt
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  no yes
##        no  775  63
##        yes  25  41
##
## Accuracy : 0.9027
## 95% CI : (0.8814, 0.9212)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.05058
##
## Kappa : 0.4316
##
## Mcnemar's Test P-Value : 8.006e-05
##
## Sensitivity : 0.9688
## Specificity : 0.3942
## Pos Pred Value : 0.9248
## Neg Pred Value : 0.6212
## Prevalence : 0.8850
## Detection Rate : 0.8573
## Detection Prevalence : 0.9270
## Balanced Accuracy : 0.6815
##
## 'Positive' Class : no
##
# Precision, Recall, F1
precision_dt <- confusion_dt$byClass['Precision']
recall_dt <- confusion_dt$byClass['Recall']
f1_score_dt <- 2 * ((precision_dt * recall_dt) / (precision_dt + recall_dt))
# Accuracy
accuracy_dt <- confusion_dt$overall['Accuracy']
# Build the results table
results_dt <- data.frame(
Model = "Decision Tree",
Precision = precision_dt,
Recall = recall_dt,
F1_Score = f1_score_dt,
Accuracy = accuracy_dt
)
print(results_dt)
##                   Model Precision  Recall  F1_Score  Accuracy
## Precision Decision Tree  0.924821 0.96875 0.9462759 0.9026549
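To see which variables drive the splits the tree actually learned, the fitted rpart object can be plotted; a minimal sketch, assuming the rpart.plot package is installed:
# Draw the fitted classification tree with split labels and node summaries
library(rpart.plot)
rpart.plot(model_dt)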
Training Data
library(rpart)
library(caret)
# Predict on the training data
train_pred <- predict(model_dt, newdata = train_data, type = "class")
# Confusion matrix
confusion_train <- confusionMatrix(train_pred, train_data$y)
confusion_train
## Confusion Matrix and Statistics
##
##           Reference
## Prediction   no  yes
##        no  3117  223
##        yes   83  194
##
## Accuracy : 0.9154
## 95% CI : (0.9059, 0.9243)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : 1.005e-09
##
## Kappa : 0.5144
##
## Mcnemar's Test P-Value : 1.925e-15
##
## Sensitivity : 0.9741
## Specificity : 0.4652
## Pos Pred Value : 0.9332
## Neg Pred Value : 0.7004
## Prevalence : 0.8847
## Detection Rate : 0.8618
## Detection Prevalence : 0.9234
## Balanced Accuracy : 0.7196
##
## 'Positive' Class : no
##
# Precision, Recall, F1-Score
precision_train <- confusion_train$byClass['Precision']
recall_train <- confusion_train$byClass['Recall']
f1_score_train <- 2 * ((precision_train * recall_train) / (precision_train + recall_train))
# Accuracy
accuracy_train <- confusion_train$overall['Accuracy']
# Build the results table
results_train <- data.frame(
Model = "Decision Tree Train",
Precision = precision_train,
Recall = recall_train,
F1_Score = f1_score_train,
Accuracy = accuracy_train
)
print(results_train)
##                         Model Precision    Recall F1_Score  Accuracy
## Precision Decision Tree Train 0.9332335 0.9740625 0.953211 0.9153995
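The gap between training accuracy (0.9154) and test accuracy (0.9027) is modest, but rpart's built-in cross-validation can indicate whether a pruned tree would generalize just as well; a sketch using rpart's standard helpers:
# Complexity-parameter table: xerror is the cross-validated error per subtree
printcp(model_dt)
# pruned_dt <- prune(model_dt, cp = ...)  # pick the cp with the lowest xerror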
Random Forest Model
Test Data
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Fit the random forest model
model_rf <- randomForest(y ~ ., data = train_data, ntree = 100)
# Predicting on the test set
predictions_rf <- predict(model_rf, newdata = test_data)
# Evaluating the model
confusion_rf <- confusionMatrix(predictions_rf, test_data$y)
confusion_rf
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  no yes
##        no  773  61
##        yes  27  43
##
## Accuracy : 0.9027
## 95% CI : (0.8814, 0.9212)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.0505847
##
## Kappa : 0.4427
##
## Mcnemar's Test P-Value : 0.0004351
##
## Sensitivity : 0.9663
## Specificity : 0.4135
## Pos Pred Value : 0.9269
## Neg Pred Value : 0.6143
## Prevalence : 0.8850
## Detection Rate : 0.8551
## Detection Prevalence : 0.9226
## Balanced Accuracy : 0.6899
##
## 'Positive' Class : no
##
# Compute precision, recall, and F1-score
precision_rf <- confusion_rf$byClass['Precision']
recall_rf <- confusion_rf$byClass['Recall']
f1_score_rf <- 2 * ((precision_rf * recall_rf) / (precision_rf + recall_rf))
# Extracting accuracy
accuracy_rf <- confusion_rf$overall['Accuracy']
# Collect precision, recall, F1-score, and accuracy in one table
results_rf <- data.frame(
Model = "Random Forest",
Precision = precision_rf,
Recall = recall_rf,
F1_Score = f1_score_rf,
Accuracy = accuracy_rf
)
results_rf
##                   Model Precision  Recall  F1_Score  Accuracy
## Precision Random Forest 0.9268585 0.96625 0.9461444 0.9026549
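Random forests also track how much each predictor contributes to the fit; a minimal sketch using randomForest's built-in importance measures:
# Mean decrease in Gini impurity per predictor
importance(model_rf)
# Dot plot of the same importance scores
varImpPlot(model_rf)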
Training Data
# Random forest on the training data
# Predict on the training set
train_pred_rf <- predict(model_rf, newdata = train_data)
# Evaluation metrics
confusion_train_rf <- confusionMatrix(train_pred_rf, train_data$y)
confusion_train_rf
## Confusion Matrix and Statistics
##
##           Reference
## Prediction   no  yes
##        no  3200    2
##        yes    0  415
##
## Accuracy : 0.9994
## 95% CI : (0.998, 0.9999)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9973
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.9952
## Pos Pred Value : 0.9994
## Neg Pred Value : 1.0000
## Prevalence : 0.8847
## Detection Rate : 0.8847
## Detection Prevalence : 0.8853
## Balanced Accuracy : 0.9976
##
## 'Positive' Class : no
##
# Compute precision, recall, and F1-score
precision_train_rf <- confusion_train_rf$byClass['Precision']
recall_train_rf <- confusion_train_rf$byClass['Recall']
f1_score_train_rf <- 2 * ((precision_train_rf * recall_train_rf) / (precision_train_rf + recall_train_rf))
# Extracting accuracy
accuracy_train_rf <- confusion_train_rf$overall['Accuracy']
# Collect precision, recall, F1-score, and accuracy in one table
results_train_rf <- data.frame(
Model = "Random Forest Train",
Precision = precision_train_rf,
Recall = recall_train_rf,
F1_Score = f1_score_train_rf,
Accuracy = accuracy_train_rf
)
results_train_rf
##                         Model Precision Recall  F1_Score  Accuracy
## Precision Random Forest Train 0.9993754      1 0.9996876 0.9994471
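The near-perfect resubstitution accuracy (0.9994) mostly reflects the forest memorizing its own training data; the out-of-bag (OOB) error reported by the model's print method is a fairer estimate of generalization. The test-set tables built above can also be combined into a single comparison; a minimal sketch (results_all is an illustrative name):
# The OOB error estimate appears in the model's printed summary
print(model_rf)
# Combine the test-set results of all three models into one table
results_all <- rbind(results_nb, results_dt, results_rf)
row.names(results_all) <- NULL
results_all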