# library
library(readxl) # untuk membaca file Excel
library(dplyr) # untuk manipulasi data
library(ggplot2) # untuk visualisasi
library(caret) # untuk evaluasi model
library(tidyr) # untuk manipulasi data
library(mice) # untuk imputasi data (jika diperlukan)
library(class) # untuk KNN
library(kknn) # untuk KNN dengan metode yang lebih fleksibel
library(randomForest) # untuk Random Forest
library(e1071) # untuk Naive Bayes dan SVM
library(nnet) # untuk Artificial Neural Network
library(party) # untuk Decision Tree
library(pROC) # untuk ROC Curve
# Ringkasan Dataset (Dataset Summary) ----
# Read the training data, lower-case every column name so later code can use
# one spelling, then show the structure of the raw data frame.
databank <- read.csv(file = "bank latih.csv")
names(databank) <- tolower(names(databank))
str(databank)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertier" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "10" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
# Per-column summary of the raw data: five-number summary and mean for the
# numeric columns, length/class/mode for the character columns.
summary(databank)
## age job marital education
## Min. :19.00 Length:4521 Length:4521 Length:4521
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :41.17
## 3rd Qu.:49.00
## Max. :87.00
## default balance housing loan
## Length:4521 Min. :-3313 Length:4521 Length:4521
## Class :character 1st Qu.: 69 Class :character Class :character
## Mode :character Median : 444 Mode :character Mode :character
## Mean : 1423
## 3rd Qu.: 1480
## Max. :71188
## contact day month duration
## Length:4521 Min. : 1.00 Length:4521 Min. : 4
## Class :character 1st Qu.: 9.00 Class :character 1st Qu.: 104
## Mode :character Median :16.00 Mode :character Median : 185
## Mean :15.92 Mean : 264
## 3rd Qu.:21.00 3rd Qu.: 329
## Max. :31.00 Max. :3025
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 Length:4521
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.00 Median : 0.0000 Mode :character
## Mean : 2.794 Mean : 39.77 Mean : 0.5426
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :50.000 Max. :871.00 Max. :25.0000
## y
## Length:4521
## Class :character
## Mode :character
##
##
##
# Report the row count before any cleaning. The value is printed explicitly,
# like the post-cleaning count further down, so it is also shown when the
# script is run through source(), where top-level values are not auto-printed.
print(paste("Jumlah baris sebelum preprocessing:", nrow(databank)))
## [1] "Jumlah baris sebelum preprocessing: 4521"
# Preprocessing Data ----
# 1. Missing values and duplicates
# Drop every row that contains an NA, then drop exact duplicate rows.
databank <- databank %>%
  na.omit() %>%
  distinct()
# 2. Validate day and month
# Keep only rows whose day of month lies in 1..31; values that cannot be read
# as a number become NA and are dropped by filter() as well.
databank$day <- as.numeric(databank$day)
databank <- databank %>% filter(day >= 1, day <= 31)

# Canonical month labels, plus a lookup from the month number written as text
# ("1" .. "12") to the matching label.
valid_months <- c("jan", "feb", "mar", "apr", "may", "jun",
                  "jul", "aug", "sep", "oct", "nov", "dec")
month_map <- valid_months
names(month_map) <- as.character(seq_along(valid_months))

# Lower-case the month, translate numeric months, and mark everything that is
# still not a canonical label as "unknown".
databank$month <- tolower(as.character(databank$month))
is_numbered <- databank$month %in% names(month_map)
databank$month[is_numbered] <- month_map[databank$month[is_numbered]]
databank$month[!(databank$month %in% valid_months)] <- "unknown"

# Remove impossible February dates (day 30 or 31).
# NOTE(review): 30-day months are not checked, so e.g. day 31 in "apr" still
# passes - confirm whether that is acceptable.
databank <- databank %>% filter(month != "feb" | day <= 29)
# 3-4. Validate the categorical variables (marital, education, the yes/no
# flags, contact and poutcome)

# Clean one categorical column: lower-case it, translate known aliases to
# their canonical spelling, and replace every value that is not in `allowed`
# (including NA) with "unknown".
#   x       : character vector to clean
#   allowed : character vector of canonical values to keep
#   aliases : named character vector; names are the aliases, values are the
#             canonical spellings they stand for (default: no aliases)
# Returns a character vector of the same length as `x`.
normalize_category <- function(x, allowed, aliases = character(0)) {
  x <- tolower(x)
  is_alias <- x %in% names(aliases)
  x[is_alias] <- aliases[x[is_alias]]
  x[!(x %in% allowed)] <- "unknown"
  x
}

databank$marital <- normalize_category(
  databank$marital,
  allowed = c("married", "single", "divorced"),
  aliases = c("menikah" = "married", "cerai" = "divorced")
)

databank$education <- normalize_category(
  databank$education,
  allowed = c("primary", "secondary", "tertiary", "unknown"),
  aliases = c("primari" = "primary", "sekunder" = "secondary",
              "tertier" = "tertiary")
)

# Binary yes/no columns share one rule set. lapply() is used instead of a
# for loop so that no loop variable is left behind in the global environment.
# NOTE(review): the target `y` is cleaned here too, so an unrecognised target
# value would become a third class "unknown" - confirm this is intended.
yesno_vars <- c("default", "housing", "loan", "y")
databank[yesno_vars] <- lapply(
  databank[yesno_vars],
  normalize_category,
  allowed = c("yes", "no"),
  aliases = c("iya" = "yes", "tidak" = "no")
)

databank$contact <- normalize_category(
  databank$contact,
  allowed = c("cellular", "telephone", "unknown"),
  aliases = c("seluler" = "cellular")
)

databank$poutcome <- normalize_category(
  databank$poutcome,
  allowed = c("success", "failure", "other", "unknown")
)
# Coerce the campaign-history counters to numeric, then drop every row that
# contains an NA (for instance one produced by a failed coercion).
databank <- databank %>%
  mutate(across(c(campaign, pdays, previous), as.numeric)) %>%
  na.omit()
# 5. Convert to factors
# Every remaining character column becomes a factor; other columns are kept.
databank_clean <- mutate(databank, across(where(is.character), as.factor))
# 6. Show the result of preprocessing
# str() prints the structure itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() would add a stray "NULL" line.
str(databank_clean)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
## $ day : num 19 11 16 3 5 23 14 6 14 17 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 11 9 1 7 9 4 9 9 9 1 ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : num 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : num -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : num 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Row count after preprocessing
databank_clean %>%
  nrow() %>%
  paste("Jumlah baris setelah preprocessing:", .) %>%
  print()
## [1] "Jumlah baris setelah preprocessing: 4521"
# First rows of the cleaned data
databank_clean %>%
  head() %>%
  print()
## age job marital education default balance housing loan contact day
## 1 30 unemployed married primary no 1787 no no cellular 19
## 2 33 services married secondary no 4789 yes yes cellular 11
## 3 35 management single tertiary no 1350 yes no cellular 16
## 4 30 management married tertiary no 1476 yes yes unknown 3
## 5 59 blue-collar married secondary no 0 yes no unknown 5
## 6 35 management single tertiary no 747 no no cellular 23
## month duration campaign pdays previous poutcome y
## 1 oct 79 1 -1 0 unknown no
## 2 may 220 1 339 4 failure no
## 3 apr 185 1 330 1 failure no
## 4 jun 199 4 -1 0 unknown no
## 5 may 226 1 -1 0 unknown no
## 6 feb 141 2 176 3 failure no
#write.csv(databank_clean, file = "databank_cl.csv", row.names = FALSE)
#View(databank_clean)
# 7. Train/test split
# 80% of the rows go to the training set and the rest to the test set;
# createDataPartition() samples within the levels of `y`, and the fixed seed
# makes the split reproducible.
set.seed(123)
trainIndex <- createDataPartition(databank_clean$y, p = 0.8, list = FALSE)
train <- databank_clean[trainIndex, ]
test <- databank_clean[-trainIndex, ]
# 8. Separate features and labels
# vapply() guarantees a logical mask whatever the input looks like, and
# list-style `[` always returns a data frame, even if only one column is
# numeric (matrix-style `[, mask]` would collapse that case to a vector).
train_features <- train[vapply(train, is.numeric, logical(1))]
test_features <- test[vapply(test, is.numeric, logical(1))]
train_labels <- factor(train$y)
test_labels <- factor(test$y)
# Naive Bayes ----
# 1. Fit Naive Bayes on every predictor in the training set
model_nb <- naiveBayes(y ~ ., data = train)
# 2. Predicted classes for the training rows
pred_train_nb <- predict(model_nb, newdata = train)
# 3. Training-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_train_nb <- confusionMatrix(
  data = factor(pred_train_nb, levels = levels(train_labels)),
  reference = train_labels
)
cat("\n=== Naive Bayes - Training ===\n")
##
## === Naive Bayes - Training ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_train_nb[["table"]])
## Reference
## Prediction no yes
## no 2924 199
## yes 276 218
cat("Akurasi:", round(CM_train_nb$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.8687
cat("F1-score:", round(CM_train_nb$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9249
cat("Precision:", round(CM_train_nb$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9363
cat("Recall:", round(CM_train_nb$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9138
# 4. Predicted classes for the held-out test rows
pred_test_nb <- predict(model_nb, newdata = test)
# 5. Test-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_test_nb <- confusionMatrix(
  data = factor(pred_test_nb, levels = levels(test_labels)),
  reference = test_labels
)
cat("\n=== Naive Bayes - Testing ===\n")
##
## === Naive Bayes - Testing ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_test_nb[["table"]])
## Reference
## Prediction no yes
## no 740 54
## yes 60 50
cat("Akurasi:", round(CM_test_nb$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.8739
cat("F1-score:", round(CM_test_nb$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9285
cat("Precision:", round(CM_test_nb$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.932
cat("Recall:", round(CM_test_nb$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.925
# Decision Tree ----
# 1. Fit a conditional inference tree on every predictor
tree_party <- ctree(y ~ ., data = train)
# 2. Draw the fitted tree
plot(tree_party)

# 3. Predicted classes for the training rows
pred_train_dtree <- predict(tree_party, newdata = train)
# 4. Training-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_train_dtree <- confusionMatrix(
  data = factor(pred_train_dtree, levels = levels(train_labels)),
  reference = train_labels
)
cat("\n=== Decision Tree - Training ===\n")
##
## === Decision Tree - Training ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_train_dtree[["table"]])
## Reference
## Prediction no yes
## no 3094 228
## yes 106 189
cat("Akurasi:", round(CM_train_dtree$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.9077
cat("F1-score:", round(CM_train_dtree$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9488
cat("Precision:", round(CM_train_dtree$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9314
cat("Recall:", round(CM_train_dtree$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9669
# 5. Predicted classes for the held-out test rows
pred_test_dtree <- predict(tree_party, newdata = test)
# 6. Test-set evaluation
# Both factors use the level order of `test_labels` ("no", "yes"), the same
# order as every other confusion matrix in this script, so the printed table
# has the same orientation as the rest. "no" is kept as the positive class,
# which leaves accuracy, F1, precision and recall unchanged.
# NOTE(review): the metrics therefore describe class "no" - confirm that this
# is the class of interest.
CM_test_dtree <- confusionMatrix(
  data = factor(pred_test_dtree, levels = levels(test_labels)),
  reference = test_labels,
  positive = "no"
)
cat("\n=== Decision Tree - Testing ===\n")
##
## === Decision Tree - Testing ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_test_dtree$table)
## Reference
## Prediction no yes
## no 775 67
## yes 25 37
cat("Akurasi:", round(CM_test_dtree$overall["Accuracy"], 4), "\n")
## Akurasi: 0.8982
cat("F1-score:", round(CM_test_dtree$byClass["F1"], 4), "\n")
## F1-score: 0.944
cat("Precision:", round(CM_test_dtree$byClass["Precision"], 4), "\n")
## Precision: 0.9204
cat("Recall:", round(CM_test_dtree$byClass["Recall"], 4), "\n")
## Recall: 0.9688
# Random Forest ----
# 1. Fit the Random Forest; `maxnodes = 20` caps the number of terminal nodes
# a single tree may have.
model_rf <- randomForest(y ~ ., data = train, maxnodes = 20)
# 2. Show the variable importance, as a table and as a plot
cat("\n=== Variable Importance ===\n")
##
## === Variable Importance ===
model_rf %>%
  importance() %>%
  print()
## MeanDecreaseGini
## age 9.4782036
## job 8.3573381
## marital 2.7382401
## education 1.6634201
## default 0.1933507
## balance 4.9653387
## housing 2.2947279
## loan 0.3397440
## contact 3.6819199
## day 4.9771176
## month 33.0832782
## duration 88.6101831
## campaign 1.9257391
## pdays 10.8628939
## previous 5.8133279
## poutcome 35.9489125
varImpPlot(model_rf)

# 3. Predicted classes for the training rows
pred_train_rf <- predict(model_rf, newdata = train)
# 4. Training-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_train_rf <- confusionMatrix(
  data = factor(pred_train_rf, levels = levels(train_labels)),
  reference = train_labels
)
cat("\n=== Random Forest - Training ===\n")
##
## === Random Forest - Training ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_train_rf[["table"]])
## Reference
## Prediction no yes
## no 3194 337
## yes 6 80
cat("Akurasi:", round(CM_train_rf$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.9052
cat("F1-score:", round(CM_train_rf$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.949
cat("Precision:", round(CM_train_rf$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9046
cat("Recall:", round(CM_train_rf$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9981
# 5. Predicted classes for the held-out test rows
pred_test_rf <- predict(model_rf, newdata = test)
# 6. Test-set evaluation (both factors use the explicit level order no, yes)
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_test_rf <- confusionMatrix(
  data = factor(pred_test_rf, levels = c("no", "yes")),
  reference = factor(test$y, levels = c("no", "yes"))
)
cat("\n=== Random Forest - Testing ===\n")
##
## === Random Forest - Testing ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_test_rf[["table"]])
## Reference
## Prediction no yes
## no 792 92
## yes 8 12
cat("Akurasi:", round(CM_test_rf$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.8894
cat("F1-score:", round(CM_test_rf$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9406
cat("Precision:", round(CM_test_rf$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.8959
cat("Recall:", round(CM_test_rf$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.99
# KNN Model ----
# 1. Fit weighted k-nearest-neighbours, letting train.kknn() choose k from
# 1..9, then show the fitted model and the misclassification rate per k.
# NOTE(review): the selected k (9) equals `kmax`, the upper end of the search
# range - consider checking whether a larger `kmax` does better.
model_knn <- train.kknn(y ~ ., data = train, kmax = 9)
print(model_knn)
##
## Call:
## train.kknn(formula = y ~ ., data = train, kmax = 9)
##
## Type of response variable: nominal
## Minimal misclassification: 0.1050594
## Best kernel: optimal
## Best k: 9
print(model_knn[["MISCLASS"]])
## optimal
## 1 0.1235831
## 2 0.1235831
## 3 0.1235831
## 4 0.1235831
## 5 0.1078242
## 6 0.1058889
## 7 0.1069947
## 8 0.1064418
## 9 0.1050594
# 2. Predictions
# Predicted classes for the training rows
pred_train_knn <- predict(object = model_knn, newdata = train)
# Predicted classes for the held-out test rows
pred_test_knn <- predict(object = model_knn, newdata = test)
# 4. Training-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_train_knn <- confusionMatrix(
  data = factor(pred_train_knn, levels = levels(train_labels)),
  reference = train_labels
)
cat("\n=== KNN - Training ===\n")
##
## === KNN - Training ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_train_knn[["table"]])
## Reference
## Prediction no yes
## no 3182 211
## yes 18 206
cat("Akurasi:", round(CM_train_knn$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.9367
cat("F1-score:", round(CM_train_knn$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9653
cat("Precision:", round(CM_train_knn$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9378
cat("Recall:", round(CM_train_knn$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9944
# 5. Test-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_test_knn <- confusionMatrix(
  data = factor(pred_test_knn, levels = levels(test_labels)),
  reference = test_labels
)
cat("\n=== KNN - Testing ===\n")
##
## === KNN - Testing ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_test_knn[["table"]])
## Reference
## Prediction no yes
## no 789 70
## yes 11 34
cat("Akurasi:", round(CM_test_knn$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.9104
cat("F1-score:", round(CM_test_knn$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9512
cat("Precision:", round(CM_test_knn$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9185
cat("Recall:", round(CM_test_knn$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9862
# ANN Model ----
# 1. Fit a single-hidden-layer neural network: 5 hidden units, weight decay
# 0.01, at most 200 optimiser iterations.
# NOTE(review): the trace below ends with "stopped after 200 iterations",
# i.e. the fit hit `maxit` - consider whether a larger limit is needed.
model_ann <- nnet(y ~ ., data = train, size = 5, decay = 0.01, maxit = 200)
## # weights: 221
## initial value 3067.322499
## iter 10 value 1280.104809
## iter 20 value 1142.019148
## iter 30 value 1119.943483
## iter 40 value 1114.681670
## iter 50 value 1110.810002
## iter 60 value 1098.683884
## iter 70 value 1059.832411
## iter 80 value 1015.319682
## iter 90 value 926.932038
## iter 100 value 880.332896
## iter 110 value 858.432733
## iter 120 value 834.937659
## iter 130 value 818.128660
## iter 140 value 804.276290
## iter 150 value 798.328264
## iter 160 value 792.932068
## iter 170 value 790.855898
## iter 180 value 787.415832
## iter 190 value 786.127963
## iter 200 value 785.017779
## final value 785.017779
## stopped after 200 iterations
# 2. Predicted class labels for the training and the test rows
pred_train_ann <- predict(object = model_ann, newdata = train, type = "class")
pred_test_ann <- predict(object = model_ann, newdata = test, type = "class")
# 3. Training-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_train_ann <- confusionMatrix(
  data = factor(pred_train_ann, levels = levels(train_labels)),
  reference = train_labels
)
cat("\n=== ANN - Training ===\n")
##
## === ANN - Training ===
print(CM_train_ann[["table"]])
## Reference
## Prediction no yes
## no 3093 226
## yes 107 191
cat("Akurasi:", round(CM_train_ann$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.9079
cat("F1-score:", round(CM_train_ann$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9489
cat("Precision:", round(CM_train_ann$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9319
cat("Recall:", round(CM_train_ann$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9666
# 4. Test-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_test_ann <- confusionMatrix(
  data = factor(pred_test_ann, levels = levels(test_labels)),
  reference = test_labels
)
cat("\n=== ANN - Testing ===\n")
##
## === ANN - Testing ===
print(CM_test_ann[["table"]])
## Reference
## Prediction no yes
## no 768 59
## yes 32 45
cat("Akurasi:", round(CM_test_ann$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.8993
cat("F1-score:", round(CM_test_ann$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9441
cat("Precision:", round(CM_test_ann$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9287
cat("Recall:", round(CM_test_ann$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.96
# Support Vector Machine (SVM) ----
# 1. Fit a support vector machine with a linear kernel on every predictor
model_svm <- svm(y ~ ., data = train, kernel = "linear")
# 2. Predicted classes for the training rows
pred_train_svm <- predict(model_svm, newdata = train)
# 3. Training-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_train_svm <- confusionMatrix(
  data = factor(pred_train_svm, levels = levels(train_labels)),
  reference = train_labels
)
cat("\n=== SVM - Training ===\n")
##
## === SVM - Training ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_train_svm[["table"]])
## Reference
## Prediction no yes
## no 3162 347
## yes 38 70
cat("Akurasi:", round(CM_train_svm$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.8936
cat("F1-score:", round(CM_train_svm$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9426
cat("Precision:", round(CM_train_svm$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.9011
cat("Recall:", round(CM_train_svm$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.9881
# 4. Predicted classes for the held-out test rows
pred_test_svm <- predict(model_svm, newdata = test)
# 5. Test-set evaluation
# NOTE(review): `positive` is not set, so F1/Precision/Recall describe the
# first level ("no") - confirm that this is the class of interest.
CM_test_svm <- confusionMatrix(
  data = factor(pred_test_svm, levels = levels(test_labels)),
  reference = test_labels
)
cat("\n=== SVM - Testing ===\n")
##
## === SVM - Testing ===
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(CM_test_svm[["table"]])
## Reference
## Prediction no yes
## no 792 91
## yes 8 13
cat("Akurasi:", round(CM_test_svm$overall[["Accuracy"]], digits = 4), "\n")
## Akurasi: 0.8905
cat("F1-score:", round(CM_test_svm$byClass[["F1"]], digits = 4), "\n")
## F1-score: 0.9412
cat("Precision:", round(CM_test_svm$byClass[["Precision"]], digits = 4), "\n")
## Precision: 0.8969
cat("Recall:", round(CM_test_svm$byClass[["Recall"]], digits = 4), "\n")
## Recall: 0.99
# Visualisasi Perbandingan Akurasi (Accuracy Comparison Plot) ----
# Compare the test-set accuracy of all models in a single bar chart.
# The test confusion matrices are collected in a named list so that the model
# label and its accuracy come from the same entry.
test_cms <- list(
  "ANN" = CM_test_ann,
  "KNN" = CM_test_knn,
  "Decision Tree" = CM_test_dtree,
  "Random Forest" = CM_test_rf,
  "Naive Bayes" = CM_test_nb,
  "SVM" = CM_test_svm
)
accuracy_results <- data.frame(
  Model = names(test_cms),
  Accuracy = vapply(
    test_cms,
    function(cm) cm$overall[["Accuracy"]],
    numeric(1),
    USE.NAMES = FALSE
  )
)
# One bar per model, with the y axis fixed to the full 0..1 accuracy range.
ggplot(accuracy_results, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(title = "Perbandingan Akurasi Antar Model") +
  theme_minimal()
