TPM - XGBoost Classification Model with DBSCAN Outlier Filtering

Read Data

library(rmdformats)
## Warning: package 'rmdformats' was built under R version 4.3.3
library(isotree)
## Warning: package 'isotree' was built under R version 4.3.3
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.3.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
data.train1 <- read.csv("D:\\Campss\\Season 6\\TPM\\Data Training.csv", sep = ";")
data.test1 <- read.csv("D:\\Campss\\Season 6\\TPM\\Data Testing.csv", sep = ";")
data.train <- data.train1 %>%
  mutate(across(everything(), ~ ifelse(trimws(.) == "", NA, .)))
data.test <- data.test1 %>%
  mutate(across(everything(), ~ ifelse(trimws(.) == "", NA, .)))
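
read.csv leaves empty cells as "" rather than NA, so the trimws step above recodes blanks explicitly. A toy illustration (hypothetical values, not from the dataset):

# "" and whitespace-only strings become NA; real values pass through
x <- c("admin.", "", "  ", "blue-collar")
ifelse(trimws(x) == "", NA, x)
# -> "admin." NA NA "blue-collar"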
table(data.train$y)
## 
##    no   yes 
## 36548  4640

Mode Imputation

# Function to compute the mode (most frequent value) of a vector
modus <- function(x) {
  # Drop NAs first
  x_no_na <- x[!is.na(x)]
  # Return the most frequently occurring value
  uniqx <- unique(x_no_na)
  uniqx[which.max(tabulate(match(x_no_na, uniqx)))]
}

# For each column, compute the mode and use it to replace NAs
data.train <- data.train %>%
  mutate(across(everything(), ~ {
    this_mode <- modus(.)
    ifelse(is.na(.), this_mode, .)
  }))

# Test rows with missing values are dropped rather than imputed
data.test <- na.omit(data.test)
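
As a quick sanity check, modus() returns the most frequent non-NA value; two toy vectors (not from the dataset):

modus(c("a", "b", "b", NA, "c"))  # -> "b"
modus(c(3, 3, 7, NA))             # -> 3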

One-Hot Encoding (1/2)

cols <- c("job", "marital", "education", "default", "housing", "loan", "contact", "month", "day_of_week", "poutcome")

# Convert these columns to factors
data.train[cols] <- lapply(data.train[cols], as.factor)
data.test[cols] <- lapply(data.test[cols], as.factor)

data_train_dummy <- dummy_cols(data.train,
                               select_columns = names(data.train)[sapply(data.train, is.factor)],
                               remove_selected_columns = TRUE)
data_test_dummy <- dummy_cols(data.test,
                              select_columns = names(data.test)[sapply(data.test, is.factor)],
                              remove_selected_columns = TRUE)

# Separate features (X) and target (y); duration is dropped because it is only known after the call
X.train <- data_train_dummy %>% select(-c(y, duration))
y.train <- data_train_dummy$y

X.test <- data_test_dummy %>% select(-c(y, duration))
y.test <- data_test_dummy$y
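
Encoding the train and test sets separately can yield mismatched dummy columns if a category level occurs in only one split. A hedged guard, not part of the original pipeline, that pads and reorders the test features to the training design:

# Add training dummies absent from the test set as all-zero columns,
# then reorder the test columns to match the training layout
missing_cols <- setdiff(colnames(X.train), colnames(X.test))
if (length(missing_cols) > 0) X.test[missing_cols] <- 0
X.test <- X.test[, colnames(X.train)]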

Outlier Detection with DBSCAN

library(dbscan)
## Warning: package 'dbscan' was built under R version 4.3.3
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
X.train.scaled <- scale(X.train)
minPts_value <- 20

# kNN distance plot for k = minPts_value; the elbow suggests a value for eps
kNNdistplot(X.train.scaled, k = minPts_value)
abline(h = 10, col = "red", lty = 2)  # chosen eps = 10
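
The eps = 10 line is read off the elbow of the kNN distance curve by eye. A hedged numeric complement (d is an illustrative name) is to inspect high quantiles of the same distances:

# Distance from each point to its 20th nearest neighbor; the elbow
# typically sits in the top few percent of this distribution
d <- kNNdist(X.train.scaled, k = minPts_value)
quantile(d, probs = c(0.90, 0.99, 0.999))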

set.seed(123)

# Run DBSCAN
dbscan_result <- dbscan(X.train.scaled, eps = 10, minPts = minPts_value)

# Cluster summary; label 0 marks noise (outlier) points
print(table(dbscan_result$cluster))
## 
##     0     1     2 
##    21 40985   182
# Indices of the noise (outlier) points
outlier_idx <- which(dbscan_result$cluster == 0)
cat("Outlier indices:", outlier_idx, "\n")
## Outlier indices: 5394 6929 12384 14446 14487 16270 17740 21581 21582 22403 22643 24867 26681 26690 26757 28627 28628 35307 37356 37814 37823
# Visualize the clusters in two dimensions via PCA
pca <- prcomp(X.train.scaled)
plot(pca$x[, 1:2],
     col = ifelse(dbscan_result$cluster == 0, "blue", dbscan_result$cluster + 1),
     pch = ifelse(dbscan_result$cluster == 0, 4, 19),
     main = "DBSCAN: Outlier (Biru) dan Cluster")

# Drop the outlier rows from the training data
X.train_clean <- X.train[-outlier_idx, ]
y.train_clean <- y.train[-outlier_idx]

X.train <- X.train_clean
y.train <- y.train_clean

One-Hot Encoding (2/2)

X_train <- X.train
X_test <- X.test
y_train <- y.train
y_test <- y.test

X_train_numeric <- model.matrix(~ . - 1, data = X_train)
X_test_numeric  <- model.matrix(~ . - 1, data = X_test)
y_train_numeric <- as.numeric(as.factor(y_train))
y_test_numeric <- as.numeric(as.factor(y_test))
# Recode the factor levels 1/2 ("no"/"yes") to 0/1
y_train_numeric <- ifelse(y_train_numeric == 1, 0, 1)
y_test_numeric <- ifelse(y_test_numeric == 1, 0, 1)
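
A quick cross-tabulation confirms the mapping came out as intended ("no" -> 0, "yes" -> 1):

# Each original label should map to exactly one numeric code
table(y_train, y_train_numeric)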

length(y_train_numeric)
## [1] 41167
nrow(X_train_numeric)
## [1] 41167

XGBoost with 3-Fold Cross-Validation

# XGBoost
set.seed(42)

# Convert the data to the DMatrix format xgboost requires
dtrain <- xgb.DMatrix(data = as.matrix(X_train_numeric), 
                      label = y_train_numeric)
dtest  <- xgb.DMatrix(data = as.matrix(X_test_numeric),  
                      label = y_test_numeric)


params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eval_metric = "logloss",
  max_depth = 6,
  eta = 0.1,
  gamma = 0,
  min_child_weight = 5,
  subsample = 1,
  colsample_bytree = 1,
  scale_pos_weight = 8  # approximately the negative:positive class ratio (~7.9)
)
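
The hard-coded scale_pos_weight = 8 approximates the class ratio; it can also be derived directly from the labels (a sketch):

# Negative-to-positive ratio on the cleaned training set:
# 36531 / 4636, roughly 7.9, so 8 is a reasonable rounded value
sum(y_train_numeric == 0) / sum(y_train_numeric == 1)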

cv_model <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 2000,
  nfold = 3,
  stratified = TRUE,
  print_every_n = 100,
  early_stopping_rounds = 50
)
## [1]  train-logloss:0.664457+0.000208 test-logloss:0.664964+0.000437 
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 50 rounds.
## 
## [101]    train-logloss:0.440976+0.001574 test-logloss:0.477492+0.004904 
## [201]    train-logloss:0.398956+0.000783 test-logloss:0.460118+0.003973 
## [301]    train-logloss:0.366537+0.002869 test-logloss:0.448266+0.006103 
## [401]    train-logloss:0.343992+0.002370 test-logloss:0.441510+0.005791 
## [501]    train-logloss:0.323250+0.004283 test-logloss:0.435008+0.004585 
## [601]    train-logloss:0.304977+0.002915 test-logloss:0.429974+0.005284 
## [701]    train-logloss:0.291321+0.002823 test-logloss:0.427270+0.004028 
## [801]    train-logloss:0.277854+0.003966 test-logloss:0.424470+0.004875 
## [901]    train-logloss:0.264695+0.003782 test-logloss:0.422275+0.004763 
## [1001]   train-logloss:0.253277+0.003580 test-logloss:0.420795+0.005412 
## [1101]   train-logloss:0.240633+0.005116 test-logloss:0.418817+0.005344 
## [1201]   train-logloss:0.229811+0.006593 test-logloss:0.417407+0.004892 
## [1301]   train-logloss:0.219665+0.005368 test-logloss:0.416257+0.004787 
## Stopping. Best iteration:
## [1298]   train-logloss:0.219897+0.005371 test-logloss:0.416244+0.004805
# Best iteration (lowest mean test logloss)
best_iter <- which.min(cv_model$evaluation_log$test_logloss_mean)
cat("Best iteration based on logloss:", best_iter, "\n")
## Best iteration based on logloss: 1298
# Train the final model with the best iteration count
model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_iter,
  watchlist = list(train = dtrain),
  verbose = 1,
  print_every_n = 100
)
## [1]  train-logloss:0.664968 
## [101]    train-logloss:0.454625 
## [201]    train-logloss:0.420326 
## [301]    train-logloss:0.394607 
## [401]    train-logloss:0.377540 
## [501]    train-logloss:0.356395 
## [601]    train-logloss:0.338780 
## [701]    train-logloss:0.325145 
## [801]    train-logloss:0.314257 
## [901]    train-logloss:0.302038 
## [1001]   train-logloss:0.289278 
## [1101]   train-logloss:0.278187 
## [1201]   train-logloss:0.269173 
## [1298]   train-logloss:0.260204
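
As an optional follow-up not run in the original document, the trained booster can report which dummy features drive its predictions:

# Gain-based feature importance of the final model (top 10)
imp <- xgb.importance(model = model)
head(imp, 10)
xgb.plot.importance(head(imp, 10))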

Model Evaluation

# ---- Evaluation on the Training Data ----
# Predicted probabilities on the training data
pred_prob_train <- predict(model, as.matrix(X_train_numeric))
# Convert probabilities to class labels (0.5 threshold)
pred_label_train <- ifelse(pred_prob_train > 0.5, 1, 0)

# Confusion matrix and evaluation metrics for training
# (caret takes the first factor level, '0' = no, as the positive class)
conf_mat_train <- confusionMatrix(as.factor(pred_label_train), as.factor(y_train_numeric))
print(conf_mat_train)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 33764   441
##          1  2767  4195
##                                           
##                Accuracy : 0.9221          
##                  95% CI : (0.9194, 0.9246)
##     No Information Rate : 0.8874          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6802          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9243          
##             Specificity : 0.9049          
##          Pos Pred Value : 0.9871          
##          Neg Pred Value : 0.6026          
##              Prevalence : 0.8874          
##          Detection Rate : 0.8202          
##    Detection Prevalence : 0.8309          
##       Balanced Accuracy : 0.9146          
##                                           
##        'Positive' Class : 0               
## 
# ROC curve and AUC for the training data
roc_train <- roc(y_train_numeric, pred_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_train, col = "blue", main = "ROC Curve - Training Data")
auc_train <- auc(roc_train)
legend("bottomright", legend = paste("AUC =", round(auc_train, 3)), col = "blue", lwd = 2)

# ---- Evaluation on the Testing Data ----

# Predicted probabilities on the test data
pred_prob_test <- predict(model, as.matrix(X_test_numeric))
# Convert probabilities to class labels (0.5 threshold)
pred_label_test <- ifelse(pred_prob_test > 0.5, 1, 0)

# Confusion matrix and evaluation metrics for testing
conf_mat_test <- confusionMatrix(as.factor(pred_label_test), as.factor(y_test_numeric))
print(conf_mat_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2504   35
##          1  216  335
##                                           
##                Accuracy : 0.9188          
##                  95% CI : (0.9086, 0.9282)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 2.365e-12       
##                                           
##                   Kappa : 0.6819          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9206          
##             Specificity : 0.9054          
##          Pos Pred Value : 0.9862          
##          Neg Pred Value : 0.6080          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8104          
##    Detection Prevalence : 0.8217          
##       Balanced Accuracy : 0.9130          
##                                           
##        'Positive' Class : 0               
## 
# ROC curve and AUC for the testing data
roc_test <- roc(y_test_numeric, pred_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_test, col = "red", main = "ROC Curve - Testing Data")
auc_test <- auc(roc_test)
legend("bottomright", legend = paste("AUC =", round(auc_test, 3)), col = "red", lwd = 2)

# Training and testing ROC curves side by side
par(mfrow = c(1, 2))
plot(roc_train, col = "blue", main = "ROC Curve - Training Data")
legend("bottomright", legend = paste("AUC =", round(auc_train, 3)), col = "blue", lwd = 2)

plot(roc_test, col = "red", main = "ROC Curve - Testing Data")
legend("bottomright", legend = paste("AUC =", round(auc_test, 3)), col = "red", lwd = 2)


# F1 from raw confusion-matrix counts
f1_score <- function(TP, FP, FN) {
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * precision * recall / (precision + recall)
  return(f1)
}
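
Since caret treats class 0 as the positive class here, the helper can be cross-checked against caret's own per-class F1 (stored in byClass):

# Should agree with f1_score() applied to the class-0 counts below
conf_mat_train$byClass["F1"]
conf_mat_test$byClass["F1"]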

# Training: counts for class 0, the positive class here
# (column-major indexing: table[1] = TP, table[2] = FN, table[3] = FP)
TP <- conf_mat_train$table[1]
FP <- conf_mat_train$table[3]
FN <- conf_mat_train$table[2]
cat("F1 Score Training:", f1_score(TP, FP, FN), "\n")
## F1 Score Training: 0.9546483
# Testing: the same counts for class 0
TP <- conf_mat_test$table[1]
FP <- conf_mat_test$table[3]
FN <- conf_mat_test$table[2]
cat("F1 Score Testing:", f1_score(TP, FP, FN), "\n")
## F1 Score Testing: 0.9522723