# Load library
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(class)
library(rpart)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(nnet)
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.3.3
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
# Read the cleaned training set from disk.
data <- read.csv("bank_latih_cleaned.csv")
# Turn every character column into a factor; all other columns pass through
# unchanged. Assigning into data[] keeps the data frame structure.
data[] <- lapply(data, function(column) {
  if (!is.character(column)) {
    return(column)
  }
  as.factor(column)
})
# Keep only the columns used for modelling by dropping the ones listed below.
data <- subset(
  data,
  select = -c(
    day, month, contact, pdays, campaign, previous,
    duration_group, campaign_group, pdays_group, previous_group,
    age_group, balance_z
  )
)
# Show the structure of what is left.
str(data)
## 'data.frame': 4311 obs. of 11 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 4 2 2 4 2 2 2 2 ...
## $ education: Factor w/ 5 levels "primary","secondary",..: 1 2 4 5 2 4 4 2 4 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Split the data: 80% for training, 20% for testing.
# The seed is fixed so the partition is reproducible.
set.seed(123)
split <- createDataPartition(data$y, p = 0.8, list = FALSE)
# `split` is a one-column matrix of row indices; flatten it for row selection.
train_data <- data[as.vector(split), ]
test_data <- data[-as.vector(split), ]
# Print confusion matrices (via confusionMatrix()) for a model's training and
# testing predictions, treating "yes" as the positive class.
#
# Args:
#   model:      Label printed in the header line.
#   train_pred: Predicted classes for the training rows (factor, character
#               vector, or one-column character matrix).
#   test_pred:  Predicted classes for the testing rows (same forms).
#   train_true: Observed classes for the training rows.
#   test_true:  Observed classes for the testing rows.
#
# Called for its printed output. Stops with an error if a prediction carries a
# label that does not exist among the observed classes.
evaluate_model <- function(model, train_pred, test_pred, train_true, test_true) {
  # Re-level predictions onto the reference's levels. as.factor() alone drops
  # any class the model never predicted (e.g. an all-"no" prediction), and
  # confusionMatrix() needs both factors to share the same levels.
  align_levels <- function(pred, reference) {
    labels <- as.character(pred)
    unknown <- setdiff(unique(labels[!is.na(labels)]), levels(reference))
    if (length(unknown) > 0) {
      stop(
        "Predictions contain labels missing from the reference: ",
        paste(unknown, collapse = ", "),
        call. = FALSE
      )
    }
    factor(labels, levels = levels(reference))
  }

  train_ref <- as.factor(train_true)
  test_ref <- as.factor(test_true)

  cat("\n===== Model:", model, "=====\n")
  cat("\n--- Training Confusion Matrix ---\n")
  print(confusionMatrix(
    align_levels(train_pred, train_ref), train_ref,
    positive = "yes"
  ))
  cat("\n--- Testing Confusion Matrix ---\n")
  print(confusionMatrix(
    align_levels(test_pred, test_ref), test_ref,
    positive = "yes"
  ))
}
# Print accuracy, precision, recall and F1 score for a model's training and
# testing predictions, with "yes" as the positive class.
#
# Args:
#   model:      Label printed in the header line.
#   train_pred: Predicted classes for the training rows.
#   test_pred:  Predicted classes for the testing rows.
#   train_true: Observed classes for the training rows.
#   test_true:  Observed classes for the testing rows.
#
# Accuracy is the share of element-wise matches; the other metrics come from
# Precision(), Recall() and F1_Score(), called as (truth, prediction).
evaluate_model1 <- function(model, train_pred, test_pred, train_true, test_true) {
  # Emit one "<label> <value>" line. `value` is evaluated lazily, so each
  # metric is computed right before its own line is written.
  report <- function(label, value) {
    cat(label, value, "\n")
  }

  cat("\n===== Model:", model, "=====\n")
  report("Training Accuracy :", mean(train_pred == train_true))
  report("Testing Accuracy :", mean(test_pred == test_true))
  report("Training Precision:", Precision(train_true, train_pred, positive = "yes"))
  report("Testing Precision :", Precision(test_true, test_pred, positive = "yes"))
  report("Training Recall :", Recall(train_true, train_pred, positive = "yes"))
  report("Testing Recall :", Recall(test_true, test_pred, positive = "yes"))
  report("Training F1 Score :", F1_Score(train_true, train_pred, positive = "yes"))
  report("Testing F1 Score :", F1_Score(test_true, test_pred, positive = "yes"))
}
# ---- K-nearest neighbours on the numeric predictors ----
# Select the numeric columns. vapply() always yields a logical vector, and
# drop = FALSE keeps a data frame even if only one column is numeric.
numeric_cols <- vapply(data, is.numeric, logical(1))
data_numeric <- data[, numeric_cols, drop = FALSE]
# Min-max scale a numeric vector to [0, 1].
# NAs are ignored when finding the range and stay NA in the result, so a single
# missing value no longer turns the whole column into NA. A constant column
# maps to 0 instead of NaN from a division by zero.
normalize <- function(x) {
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - lo) / span
}
# NOTE(review): the scaling range is computed over all rows, including the
# rows later held out for testing. Fit the range on the training rows only if
# a leakage-free test estimate is required.
data_numeric_norm <- as.data.frame(lapply(data_numeric, normalize))
# Re-attach the target column.
data_knn <- cbind(data_numeric_norm, y = data$y)
# Reuse the row indices from the earlier partition.
train_knn <- data_knn[split, ]
test_knn <- data_knn[-split, ]
# Drop rows that contain missing values.
train_knn <- na.omit(train_knn)
test_knn <- na.omit(test_knn)
# knn() takes the predictors and the class labels separately.
train_X <- subset(train_knn, select = -y)
test_X <- subset(test_knn, select = -y)
train_y <- train_knn$y
test_y <- test_knn$y
# Classify the training rows and the testing rows against the training set,
# using the 5 nearest neighbours.
knn_pred_train <- knn(train = train_X, test = train_X, cl = train_y, k = 5)
knn_pred_test <- knn(train = train_X, test = test_X, cl = train_y, k = 5)
# Print the confusion matrices.
evaluate_model("KNN", knn_pred_train, knn_pred_test, train_y, test_y)
##
## ===== Model: KNN =====
##
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2972 252
## yes 79 147
##
## Accuracy : 0.9041
## 95% CI : (0.8937, 0.9137)
## No Information Rate : 0.8843
## P-Value [Acc > NIR] : 0.0001159
##
## Kappa : 0.4221
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.36842
## Specificity : 0.97411
## Pos Pred Value : 0.65044
## Neg Pred Value : 0.92184
## Prevalence : 0.11565
## Detection Rate : 0.04261
## Detection Prevalence : 0.06551
## Balanced Accuracy : 0.67126
##
## 'Positive' Class : yes
##
##
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 738 72
## yes 24 27
##
## Accuracy : 0.8885
## 95% CI : (0.8656, 0.9087)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.3996
##
## Kappa : 0.3057
##
## Mcnemar's Test P-Value : 1.611e-06
##
## Sensitivity : 0.27273
## Specificity : 0.96850
## Pos Pred Value : 0.52941
## Neg Pred Value : 0.91111
## Prevalence : 0.11498
## Detection Rate : 0.03136
## Detection Prevalence : 0.05923
## Balanced Accuracy : 0.62062
##
## 'Positive' Class : yes
##
# Print accuracy, precision, recall and F1 for the KNN predictions.
evaluate_model1(
  model = "KNN",
  train_pred = knn_pred_train,
  test_pred = knn_pred_test,
  train_true = train_y,
  test_true = test_y
)
##
## ===== Model: KNN =====
## Training Accuracy : 0.904058
## Testing Accuracy : 0.8885017
## Training Precision: 0.6504425
## Testing Precision : 0.5294118
## Training Recall : 0.3684211
## Testing Recall : 0.2727273
## Training F1 Score : 0.4704
## Testing F1 Score : 0.36
# ---- Naive Bayes ----
# Fit on every predictor in the training split, then predict classes for both
# splits and print the confusion matrices.
nb_model <- naiveBayes(y ~ ., data = train_data)
nb_train_pred <- predict(nb_model, newdata = train_data)
nb_test_pred <- predict(nb_model, newdata = test_data)
evaluate_model(
  model = "Naive Bayes",
  train_pred = nb_train_pred,
  test_pred = nb_test_pred,
  train_true = train_data$y,
  test_true = test_data$y
)
##
## ===== Model: Naive Bayes =====
##
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2929 226
## yes 122 173
##
## Accuracy : 0.8991
## 95% CI : (0.8886, 0.909)
## No Information Rate : 0.8843
## P-Value [Acc > NIR] : 0.003132
##
## Kappa : 0.4439
##
## Mcnemar's Test P-Value : 3.363e-08
##
## Sensitivity : 0.43358
## Specificity : 0.96001
## Pos Pred Value : 0.58644
## Neg Pred Value : 0.92837
## Prevalence : 0.11565
## Detection Rate : 0.05014
## Detection Prevalence : 0.08551
## Balanced Accuracy : 0.69680
##
## 'Positive' Class : yes
##
##
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 740 66
## yes 22 33
##
## Accuracy : 0.8978
## 95% CI : (0.8756, 0.9172)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.1301
##
## Kappa : 0.3774
##
## Mcnemar's Test P-Value : 4.566e-06
##
## Sensitivity : 0.33333
## Specificity : 0.97113
## Pos Pred Value : 0.60000
## Neg Pred Value : 0.91811
## Prevalence : 0.11498
## Detection Rate : 0.03833
## Detection Prevalence : 0.06388
## Balanced Accuracy : 0.65223
##
## 'Positive' Class : yes
##
# Print accuracy, precision, recall and F1 for the Naive Bayes predictions.
evaluate_model1(
  model = "Naive Bayes",
  train_pred = nb_train_pred,
  test_pred = nb_test_pred,
  train_true = train_data$y,
  test_true = test_data$y
)
##
## ===== Model: Naive Bayes =====
## Training Accuracy : 0.8991304
## Testing Accuracy : 0.8977933
## Training Precision: 0.5864407
## Testing Precision : 0.6
## Training Recall : 0.433584
## Testing Recall : 0.3333333
## Training F1 Score : 0.4985591
## Testing F1 Score : 0.4285714
# ---- Decision tree ----
# Fit a classification tree on the training split, predict class labels for
# both splits and print the confusion matrices.
dt_model <- rpart(y ~ ., data = train_data, method = "class")
dt_train_pred <- predict(dt_model, newdata = train_data, type = "class")
dt_test_pred <- predict(dt_model, newdata = test_data, type = "class")
evaluate_model(
  model = "Decision Tree",
  train_pred = dt_train_pred,
  test_pred = dt_test_pred,
  train_true = train_data$y,
  test_true = test_data$y
)
##
## ===== Model: Decision Tree =====
##
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2982 253
## yes 69 146
##
## Accuracy : 0.9067
## 95% CI : (0.8965, 0.9162)
## No Information Rate : 0.8843
## P-Value [Acc > NIR] : 1.4e-05
##
## Kappa : 0.4294
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.36591
## Specificity : 0.97738
## Pos Pred Value : 0.67907
## Neg Pred Value : 0.92179
## Prevalence : 0.11565
## Detection Rate : 0.04232
## Detection Prevalence : 0.06232
## Balanced Accuracy : 0.67165
##
## 'Positive' Class : yes
##
##
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 751 72
## yes 11 27
##
## Accuracy : 0.9036
## 95% CI : (0.8819, 0.9225)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.04628
##
## Kappa : 0.3529
##
## Mcnemar's Test P-Value : 4.523e-11
##
## Sensitivity : 0.27273
## Specificity : 0.98556
## Pos Pred Value : 0.71053
## Neg Pred Value : 0.91252
## Prevalence : 0.11498
## Detection Rate : 0.03136
## Detection Prevalence : 0.04413
## Balanced Accuracy : 0.62915
##
## 'Positive' Class : yes
##
# Print accuracy, precision, recall and F1 for the decision tree predictions.
evaluate_model1(
  model = "Decision Tree",
  train_pred = dt_train_pred,
  test_pred = dt_test_pred,
  train_true = train_data$y,
  test_true = test_data$y
)
##
## ===== Model: Decision Tree =====
## Training Accuracy : 0.9066667
## Testing Accuracy : 0.9036005
## Training Precision: 0.6790698
## Testing Precision : 0.7105263
## Training Recall : 0.3659148
## Testing Recall : 0.2727273
## Training F1 Score : 0.47557
## Testing F1 Score : 0.3941606
# ---- Random forest ----
# Fit a 100-tree forest on the training split, predict both splits and print
# the confusion matrices.
# NOTE(review): the training-split predictions score rows the forest was
# fitted on, so the training metrics are expected to look far better than the
# testing ones; compare on the testing metrics.
rf_model <- randomForest(y ~ ., data = train_data, ntree = 100)
rf_train_pred <- predict(rf_model, newdata = train_data)
rf_test_pred <- predict(rf_model, newdata = test_data)
evaluate_model(
  model = "Random Forest",
  train_pred = rf_train_pred,
  test_pred = rf_test_pred,
  train_true = train_data$y,
  test_true = test_data$y
)
##
## ===== Model: Random Forest =====
##
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 3051 15
## yes 0 384
##
## Accuracy : 0.9957
## 95% CI : (0.9928, 0.9976)
## No Information Rate : 0.8843
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9784
##
## Mcnemar's Test P-Value : 0.0003006
##
## Sensitivity : 0.9624
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9951
## Prevalence : 0.1157
## Detection Rate : 0.1113
## Detection Prevalence : 0.1113
## Balanced Accuracy : 0.9812
##
## 'Positive' Class : yes
##
##
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 747 65
## yes 15 34
##
## Accuracy : 0.9071
## 95% CI : (0.8857, 0.9256)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.02166
##
## Kappa : 0.4149
##
## Mcnemar's Test P-Value : 4.293e-08
##
## Sensitivity : 0.34343
## Specificity : 0.98031
## Pos Pred Value : 0.69388
## Neg Pred Value : 0.91995
## Prevalence : 0.11498
## Detection Rate : 0.03949
## Detection Prevalence : 0.05691
## Balanced Accuracy : 0.66187
##
## 'Positive' Class : yes
##
# Print accuracy, precision, recall and F1 for the random forest predictions.
evaluate_model1(
  model = "Random Forest",
  train_pred = rf_train_pred,
  test_pred = rf_test_pred,
  train_true = train_data$y,
  test_true = test_data$y
)
##
## ===== Model: Random Forest =====
## Training Accuracy : 0.9956522
## Testing Accuracy : 0.9070848
## Training Precision: 1
## Testing Precision : 0.6938776
## Training Recall : 0.962406
## Testing Recall : 0.3434343
## Training F1 Score : 0.9808429
## Testing F1 Score : 0.4594595
# ---- Neural network (nnet) ----
# Min-max scale a numeric vector to [0, 1].
# NAs are ignored when finding the range and stay NA in the result; a constant
# column maps to 0 instead of NaN from a division by zero.
normalize <- function(x) {
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - lo) / span
}
# Scale the numeric columns and put them in front of the untouched non-numeric
# ones. drop = FALSE keeps data frames even when a side has a single column.
# NOTE(review): the scaling range is computed over all rows, including the
# rows later held out for testing. Fit the range on the training rows only if
# a leakage-free test estimate is required.
is_num <- vapply(data, is.numeric, logical(1))
data_norm <- as.data.frame(lapply(data[, is_num, drop = FALSE], normalize))
data_final <- cbind(data_norm, data[, !is_num, drop = FALSE])
data_final$y <- as.factor(data_final$y)
# Same partition as before, applied to the scaled data. These objects are not
# used by the network below and overwrite the train_X / train_y / test_X /
# test_y created for KNN; they are kept in case other code relies on them.
train_norm <- data_final[split, ]
test_norm <- data_final[-split, ]
train_X <- subset(train_norm, select = -y)
train_y <- train_norm$y
test_X <- subset(test_norm, select = -y)
test_y <- test_norm$y
# Network input: scaled numeric predictors plus the factor columns. The factor
# target is passed to nnet() as-is through the formula.
nn_data <- data_final
nn_train <- nn_data[split, ]
nn_test <- nn_data[-split, ]
nn_model <- nnet(
  y ~ .,
  data = nn_train, size = 5, maxit = 200, decay = 0.01, trace = FALSE
)
nn_train_prob <- predict(nn_model, nn_train, type = "raw")
nn_test_prob <- predict(nn_model, nn_test, type = "raw")
# Threshold the raw outputs at 0.5. The result is stored as a plain factor
# with the target's levels (not a one-column character matrix), so it has the
# same form as the other models' predictions and keeps both classes as levels
# even when only one of them is predicted.
nn_train_pred <- factor(
  ifelse(as.vector(nn_train_prob) > 0.5, "yes", "no"),
  levels = levels(nn_train$y)
)
nn_test_pred <- factor(
  ifelse(as.vector(nn_test_prob) > 0.5, "yes", "no"),
  levels = levels(nn_test$y)
)
evaluate_model("Neural Network", nn_train_pred, nn_test_pred, nn_train$y, nn_test$y)
##
## ===== Model: Neural Network =====
##
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2978 227
## yes 73 172
##
## Accuracy : 0.913
## 95% CI : (0.9031, 0.9222)
## No Information Rate : 0.8843
## P-Value [Acc > NIR] : 2.554e-08
##
## Kappa : 0.4892
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.43108
## Specificity : 0.97607
## Pos Pred Value : 0.70204
## Neg Pred Value : 0.92917
## Prevalence : 0.11565
## Detection Rate : 0.04986
## Detection Prevalence : 0.07101
## Balanced Accuracy : 0.70358
##
## 'Positive' Class : yes
##
##
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 745 73
## yes 17 26
##
## Accuracy : 0.8955
## 95% CI : (0.8731, 0.9151)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.1825
##
## Kappa : 0.3188
##
## Mcnemar's Test P-Value : 6.731e-09
##
## Sensitivity : 0.26263
## Specificity : 0.97769
## Pos Pred Value : 0.60465
## Neg Pred Value : 0.91076
## Prevalence : 0.11498
## Detection Rate : 0.03020
## Detection Prevalence : 0.04994
## Balanced Accuracy : 0.62016
##
## 'Positive' Class : yes
##
# Print accuracy, precision, recall and F1 for the neural network predictions.
evaluate_model1(
  model = "Neural Network",
  train_pred = nn_train_pred,
  test_pred = nn_test_pred,
  train_true = nn_train$y,
  test_true = nn_test$y
)
##
## ===== Model: Neural Network =====
## Training Accuracy : 0.9130435
## Testing Accuracy : 0.8954704
## Training Precision: 0.7020408
## Testing Precision : 0.6046512
## Training Recall : 0.4310777
## Testing Recall : 0.2626263
## Training F1 Score : 0.5341615
## Testing F1 Score : 0.3661972