Praktikum Data Mining: Pembentukan Model Klasifikasi Data Bank Latih

# Load library
library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.3.3

## Loading required package: lattice

library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

library(class)
library(rpart)
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.3.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(nnet)
library(MLmetrics)

## Warning: package 'MLmetrics' was built under R version 4.3.3

## 
## Attaching package: 'MLmetrics'

## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE

## The following object is masked from 'package:base':
## 
##     Recall

# Load the cleaned bank training data.
data <- read.csv("bank_latih_cleaned.csv")

# Turn every character column into a factor; other columns pass through as-is.
data[] <- lapply(data, function(column) {
  if (!is.character(column)) {
    return(column)
  }
  as.factor(column)
})

# Drop the columns that are not used for modelling, then show the structure
# of what remains.
data <- subset(
  data,
  select = -c(
    day, month, contact, pdays, campaign, previous,
    duration_group, campaign_group, pdays_group, previous_group,
    age_group, balance_z
  )
)
str(data)

## 'data.frame':    4311 obs. of  11 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : Factor w/ 11 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital  : Factor w/ 4 levels "divorced","married",..: 2 2 4 2 2 4 2 2 2 2 ...
##  $ education: Factor w/ 5 levels "primary","secondary",..: 1 2 4 5 2 4 4 2 4 1 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

# Hold out 20% of the rows for testing (80% train); the fixed seed makes the
# partition reproducible.
set.seed(123)
split <- createDataPartition(y = data$y, p = 0.8, list = FALSE)
train_data <- data[split, , drop = FALSE]
test_data <- data[-split, , drop = FALSE]


# Print the caret confusion-matrix report of a classifier for the training
# and the testing split.
#
# model       label printed in the header line.
# train_pred  predicted labels for the training rows.
# test_pred   predicted labels for the testing rows.
# train_true  actual labels of the training rows.
# test_true   actual labels of the testing rows.
#
# Both predictions and actual labels are coerced with as.factor(), and "yes"
# is passed to confusionMatrix() as the positive class.
evaluate_model <- function(model, train_pred, test_pred, train_true, test_true) {
  # Print one heading followed by the confusion-matrix report of one split.
  report <- function(heading, predicted, actual) {
    cat(heading)
    print(
      confusionMatrix(as.factor(predicted), as.factor(actual), positive = "yes")
    )
  }

  cat("\n===== Model:", model, "=====\n")
  report("\n--- Training Confusion Matrix ---\n", train_pred, train_true)
  report("\n--- Testing Confusion Matrix ---\n", test_pred, test_true)
}

# Print accuracy, precision, recall and F1 score of a classifier for the
# training and the testing split.
#
# model       label printed in the header line.
# train_pred  predicted labels for the training rows.
# test_pred   predicted labels for the testing rows.
# train_true  actual labels of the training rows.
# test_true   actual labels of the testing rows.
#
# Precision, recall and F1 come from Precision(), Recall() and F1_Score(),
# called with "yes" as the positive class. Returns NULL invisibly; the
# function is used for its printed output.
evaluate_model1 <- function(model, train_pred, test_pred, train_true, test_true) {
  # Each entry pairs the two output labels with the metric to compute.
  # Every metric takes (truth, pred) so the loop below can treat them alike.
  metrics <- list(
    list(
      train = "Training Accuracy :", test = "Testing Accuracy  :",
      # Compare as character: `==` on two factors stops with "level sets of
      # factors are different" when a prediction factor lacks a class level.
      fn = function(truth, pred) mean(as.character(pred) == as.character(truth))
    ),
    list(
      train = "Training Precision:", test = "Testing Precision :",
      fn = function(truth, pred) Precision(truth, pred, positive = "yes")
    ),
    list(
      train = "Training Recall   :", test = "Testing Recall    :",
      fn = function(truth, pred) Recall(truth, pred, positive = "yes")
    ),
    list(
      train = "Training F1 Score :", test = "Testing F1 Score  :",
      fn = function(truth, pred) F1_Score(truth, pred, positive = "yes")
    )
  )

  cat("\n===== Model:", model, "=====\n")

  for (metric in metrics) {
    cat(metric[["train"]], metric[["fn"]](train_true, train_pred), "\n")
    cat(metric[["test"]], metric[["fn"]](test_true, test_pred), "\n")
  }

  invisible(NULL)
}

# ---- K-nearest neighbours on the numeric predictors ----

# Logical mask of the numeric columns. vapply() always yields a logical
# vector, and drop = FALSE keeps a data frame even when only one column is
# numeric.
numeric_cols <- vapply(data, is.numeric, logical(1))
data_numeric <- data[, numeric_cols, drop = FALSE]

# Min-max scale a numeric vector to [0, 1].
# Missing values are ignored when finding the range (they stay NA in the
# result), and a constant column maps to 0 instead of dividing by zero.
normalize <- function(x) {
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (isTRUE(span == 0)) {
    return(x - lo)
  }
  (x - lo) / span
}

# NOTE(review): the minimum and maximum are taken over the full data set
# before it is split, so test rows influence the scaling. Left unchanged so
# the reported results stay valid; consider deriving the range from the
# training rows only.
data_numeric_norm <- as.data.frame(lapply(data_numeric, normalize))

# Re-attach the target to the scaled predictors.
data_knn <- cbind(data_numeric_norm, y = data$y)

# Reuse the row indices of the earlier train/test partition.
train_knn <- data_knn[split, ]
test_knn  <- data_knn[-split, ]

# Drop rows with missing values before calling knn().
train_knn <- na.omit(train_knn)
test_knn <- na.omit(test_knn)

# knn() takes the predictors and the class labels as separate arguments.
train_X <- subset(train_knn, select = -y)
test_X  <- subset(test_knn, select = -y)
train_y <- train_knn$y
test_y  <- test_knn$y

# Classify with k = 5. The training predictions query the training rows
# themselves, so every row is a candidate neighbour of itself.
knn_pred_train <- class::knn(train = train_X, test = train_X, cl = train_y, k = 5)
knn_pred_test  <- class::knn(train = train_X, test = test_X, cl = train_y, k = 5)

# Print the confusion matrices for both splits.
evaluate_model("KNN", knn_pred_train, knn_pred_test, train_y, test_y)

## 
## ===== Model: KNN =====
## 
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2972  252
##        yes   79  147
##                                           
##                Accuracy : 0.9041          
##                  95% CI : (0.8937, 0.9137)
##     No Information Rate : 0.8843          
##     P-Value [Acc > NIR] : 0.0001159       
##                                           
##                   Kappa : 0.4221          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.36842         
##             Specificity : 0.97411         
##          Pos Pred Value : 0.65044         
##          Neg Pred Value : 0.92184         
##              Prevalence : 0.11565         
##          Detection Rate : 0.04261         
##    Detection Prevalence : 0.06551         
##       Balanced Accuracy : 0.67126         
##                                           
##        'Positive' Class : yes             
##                                           
## 
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  738  72
##        yes  24  27
##                                           
##                Accuracy : 0.8885          
##                  95% CI : (0.8656, 0.9087)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.3996          
##                                           
##                   Kappa : 0.3057          
##                                           
##  Mcnemar's Test P-Value : 1.611e-06       
##                                           
##             Sensitivity : 0.27273         
##             Specificity : 0.96850         
##          Pos Pred Value : 0.52941         
##          Neg Pred Value : 0.91111         
##              Prevalence : 0.11498         
##          Detection Rate : 0.03136         
##    Detection Prevalence : 0.05923         
##       Balanced Accuracy : 0.62062         
##                                           
##        'Positive' Class : yes             
##

# Print accuracy, precision, recall and F1 of the KNN predictions for both splits.
evaluate_model1("KNN", knn_pred_train, knn_pred_test, train_y, test_y)

## 
## ===== Model: KNN =====
## Training Accuracy : 0.904058 
## Testing Accuracy  : 0.8885017 
## Training Precision: 0.6504425 
## Testing Precision : 0.5294118 
## Training Recall   : 0.3684211 
## Testing Recall    : 0.2727273 
## Training F1 Score : 0.4704 
## Testing F1 Score  : 0.36

# ---- Naive Bayes ----

# Fit on the training split, using every other column as a predictor of y.
nb_model <- naiveBayes(y ~ ., data = train_data)

# Predicted class labels for the training and the testing rows.
nb_train_pred <- predict(nb_model, newdata = train_data, type = "class")
nb_test_pred  <- predict(nb_model, newdata = test_data, type = "class")

# Print the confusion matrices for both splits.
evaluate_model(
  "Naive Bayes",
  nb_train_pred, nb_test_pred,
  train_data$y, test_data$y
)

## 
## ===== Model: Naive Bayes =====
## 
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2929  226
##        yes  122  173
##                                          
##                Accuracy : 0.8991         
##                  95% CI : (0.8886, 0.909)
##     No Information Rate : 0.8843         
##     P-Value [Acc > NIR] : 0.003132       
##                                          
##                   Kappa : 0.4439         
##                                          
##  Mcnemar's Test P-Value : 3.363e-08      
##                                          
##             Sensitivity : 0.43358        
##             Specificity : 0.96001        
##          Pos Pred Value : 0.58644        
##          Neg Pred Value : 0.92837        
##              Prevalence : 0.11565        
##          Detection Rate : 0.05014        
##    Detection Prevalence : 0.08551        
##       Balanced Accuracy : 0.69680        
##                                          
##        'Positive' Class : yes            
##                                          
## 
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  740  66
##        yes  22  33
##                                           
##                Accuracy : 0.8978          
##                  95% CI : (0.8756, 0.9172)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.1301          
##                                           
##                   Kappa : 0.3774          
##                                           
##  Mcnemar's Test P-Value : 4.566e-06       
##                                           
##             Sensitivity : 0.33333         
##             Specificity : 0.97113         
##          Pos Pred Value : 0.60000         
##          Neg Pred Value : 0.91811         
##              Prevalence : 0.11498         
##          Detection Rate : 0.03833         
##    Detection Prevalence : 0.06388         
##       Balanced Accuracy : 0.65223         
##                                           
##        'Positive' Class : yes             
##

# Print accuracy, precision, recall and F1 of the Naive Bayes predictions for both splits.
evaluate_model1("Naive Bayes", nb_train_pred, nb_test_pred, train_data$y, test_data$y)

## 
## ===== Model: Naive Bayes =====
## Training Accuracy : 0.8991304 
## Testing Accuracy  : 0.8977933 
## Training Precision: 0.5864407 
## Testing Precision : 0.6 
## Training Recall   : 0.433584 
## Testing Recall    : 0.3333333 
## Training F1 Score : 0.4985591 
## Testing F1 Score  : 0.4285714

# ---- Decision tree (rpart) ----

# Grow a classification tree on the training split, using every other column
# as a predictor of y.
dt_model <- rpart(formula = y ~ ., data = train_data, method = "class")

# Predicted class labels for the training and the testing rows.
dt_train_pred <- predict(dt_model, newdata = train_data, type = "class")
dt_test_pred  <- predict(dt_model, newdata = test_data, type = "class")

# Print the confusion matrices for both splits.
evaluate_model(
  "Decision Tree",
  dt_train_pred, dt_test_pred,
  train_data$y, test_data$y
)

## 
## ===== Model: Decision Tree =====
## 
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2982  253
##        yes   69  146
##                                           
##                Accuracy : 0.9067          
##                  95% CI : (0.8965, 0.9162)
##     No Information Rate : 0.8843          
##     P-Value [Acc > NIR] : 1.4e-05         
##                                           
##                   Kappa : 0.4294          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.36591         
##             Specificity : 0.97738         
##          Pos Pred Value : 0.67907         
##          Neg Pred Value : 0.92179         
##              Prevalence : 0.11565         
##          Detection Rate : 0.04232         
##    Detection Prevalence : 0.06232         
##       Balanced Accuracy : 0.67165         
##                                           
##        'Positive' Class : yes             
##                                           
## 
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  751  72
##        yes  11  27
##                                           
##                Accuracy : 0.9036          
##                  95% CI : (0.8819, 0.9225)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.04628         
##                                           
##                   Kappa : 0.3529          
##                                           
##  Mcnemar's Test P-Value : 4.523e-11       
##                                           
##             Sensitivity : 0.27273         
##             Specificity : 0.98556         
##          Pos Pred Value : 0.71053         
##          Neg Pred Value : 0.91252         
##              Prevalence : 0.11498         
##          Detection Rate : 0.03136         
##    Detection Prevalence : 0.04413         
##       Balanced Accuracy : 0.62915         
##                                           
##        'Positive' Class : yes             
##

# Print accuracy, precision, recall and F1 of the decision tree predictions for both splits.
evaluate_model1("Decision Tree", dt_train_pred, dt_test_pred, train_data$y, test_data$y)

## 
## ===== Model: Decision Tree =====
## Training Accuracy : 0.9066667 
## Testing Accuracy  : 0.9036005 
## Training Precision: 0.6790698 
## Testing Precision : 0.7105263 
## Training Recall   : 0.3659148 
## Testing Recall    : 0.2727273 
## Training F1 Score : 0.47557 
## Testing F1 Score  : 0.3941606

# ---- Random forest ----

# Fit a forest of 100 trees on the training split, using every other column
# as a predictor of y.
rf_model <- randomForest(y ~ ., data = train_data, ntree = 100)

# Predicted class labels for the training and the testing rows.
# NOTE(review): the training predictions are made on the same rows the forest
# was fitted to; presumably the out-of-bag predictions would give a less
# optimistic training estimate - confirm against the randomForest docs.
rf_train_pred <- predict(rf_model, newdata = train_data, type = "response")
rf_test_pred  <- predict(rf_model, newdata = test_data, type = "response")

# Print the confusion matrices for both splits.
evaluate_model(
  "Random Forest",
  rf_train_pred, rf_test_pred,
  train_data$y, test_data$y
)

## 
## ===== Model: Random Forest =====
## 
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3051   15
##        yes    0  384
##                                           
##                Accuracy : 0.9957          
##                  95% CI : (0.9928, 0.9976)
##     No Information Rate : 0.8843          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9784          
##                                           
##  Mcnemar's Test P-Value : 0.0003006       
##                                           
##             Sensitivity : 0.9624          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9951          
##              Prevalence : 0.1157          
##          Detection Rate : 0.1113          
##    Detection Prevalence : 0.1113          
##       Balanced Accuracy : 0.9812          
##                                           
##        'Positive' Class : yes             
##                                           
## 
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  747  65
##        yes  15  34
##                                           
##                Accuracy : 0.9071          
##                  95% CI : (0.8857, 0.9256)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.02166         
##                                           
##                   Kappa : 0.4149          
##                                           
##  Mcnemar's Test P-Value : 4.293e-08       
##                                           
##             Sensitivity : 0.34343         
##             Specificity : 0.98031         
##          Pos Pred Value : 0.69388         
##          Neg Pred Value : 0.91995         
##              Prevalence : 0.11498         
##          Detection Rate : 0.03949         
##    Detection Prevalence : 0.05691         
##       Balanced Accuracy : 0.66187         
##                                           
##        'Positive' Class : yes             
##

# Print accuracy, precision, recall and F1 of the random forest predictions for both splits.
evaluate_model1("Random Forest", rf_train_pred, rf_test_pred, train_data$y, test_data$y)

## 
## ===== Model: Random Forest =====
## Training Accuracy : 0.9956522 
## Testing Accuracy  : 0.9070848 
## Training Precision: 1 
## Testing Precision : 0.6938776 
## Training Recall   : 0.962406 
## Testing Recall    : 0.3434343 
## Training F1 Score : 0.9808429 
## Testing F1 Score  : 0.4594595

# ---- Neural network (nnet) ----

# Min-max scale a numeric vector to [0, 1].
# Missing values are ignored when finding the range (they stay NA in the
# result), and a constant column maps to 0 instead of dividing by zero.
normalize <- function(x) {
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  if (isTRUE(span == 0)) {
    return(x - lo)
  }
  (x - lo) / span
}

# Scale the numeric columns and put the remaining columns (the categorical
# predictors and the target y) after them. drop = FALSE keeps both pieces as
# data frames even if either side has a single column.
# NOTE(review): as in the KNN section, the scaling range is taken over the
# full data set, so test rows influence it.
numeric_cols <- vapply(data, is.numeric, logical(1))
data_norm <- as.data.frame(lapply(data[, numeric_cols, drop = FALSE], normalize))
data_final <- cbind(data_norm, data[, !numeric_cols, drop = FALSE])
data_final$y <- as.factor(data_final$y)

# Reuse the row indices of the earlier train/test partition.
nn_train <- data_final[split, ]
nn_test  <- data_final[-split, ]

# Network with 5 hidden units, weight decay 0.01 and at most 200 iterations.
nn_model <- nnet(
  y ~ .,
  data = nn_train,
  size = 5, maxit = 200, decay = 0.01, trace = FALSE
)

# Raw network outputs for both splits.
# NOTE(review): presumably the estimated probability of the second level of y
# ("yes") - confirm against the nnet documentation.
nn_train_prob <- predict(nn_model, nn_train, type = "raw")
nn_test_prob  <- predict(nn_model, nn_test, type = "raw")

# Label outputs above 0.5 as "yes". The result is a plain factor vector that
# carries the target's levels even when one class is never predicted.
nn_train_pred <- factor(
  ifelse(as.vector(nn_train_prob) > 0.5, "yes", "no"),
  levels = levels(nn_train$y)
)
nn_test_pred <- factor(
  ifelse(as.vector(nn_test_prob) > 0.5, "yes", "no"),
  levels = levels(nn_train$y)
)

# Print the confusion matrices for both splits.
evaluate_model("Neural Network", nn_train_pred, nn_test_pred, nn_train$y, nn_test$y)

## 
## ===== Model: Neural Network =====
## 
## --- Training Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2978  227
##        yes   73  172
##                                           
##                Accuracy : 0.913           
##                  95% CI : (0.9031, 0.9222)
##     No Information Rate : 0.8843          
##     P-Value [Acc > NIR] : 2.554e-08       
##                                           
##                   Kappa : 0.4892          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.43108         
##             Specificity : 0.97607         
##          Pos Pred Value : 0.70204         
##          Neg Pred Value : 0.92917         
##              Prevalence : 0.11565         
##          Detection Rate : 0.04986         
##    Detection Prevalence : 0.07101         
##       Balanced Accuracy : 0.70358         
##                                           
##        'Positive' Class : yes             
##                                           
## 
## --- Testing Confusion Matrix ---
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  745  73
##        yes  17  26
##                                           
##                Accuracy : 0.8955          
##                  95% CI : (0.8731, 0.9151)
##     No Information Rate : 0.885           
##     P-Value [Acc > NIR] : 0.1825          
##                                           
##                   Kappa : 0.3188          
##                                           
##  Mcnemar's Test P-Value : 6.731e-09       
##                                           
##             Sensitivity : 0.26263         
##             Specificity : 0.97769         
##          Pos Pred Value : 0.60465         
##          Neg Pred Value : 0.91076         
##              Prevalence : 0.11498         
##          Detection Rate : 0.03020         
##    Detection Prevalence : 0.04994         
##       Balanced Accuracy : 0.62016         
##                                           
##        'Positive' Class : yes             
##

# Print accuracy, precision, recall and F1 of the neural network predictions for both splits.
evaluate_model1("Neural Network", nn_train_pred, nn_test_pred, nn_train$y, nn_test$y)

## 
## ===== Model: Neural Network =====
## Training Accuracy : 0.9130435 
## Testing Accuracy  : 0.8954704 
## Training Precision: 0.7020408 
## Testing Precision : 0.6046512 
## Training Recall   : 0.4310777 
## Testing Recall    : 0.2626263 
## Training F1 Score : 0.5341615 
## Testing F1 Score  : 0.3661972

Praktikum Data Mining: Pembentukan Model Klasifikasi Data Bank Latih

Kelompok 4 3SD2 2025: Faruq, Irsan, Jernita

2025-04-14