Read Data
## Warning: package 'rmdformats' was built under R version 4.3.3
## Warning: package 'isotree' was built under R version 4.3.3
## Warning: package 'xgboost' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
## Warning: package 'readxl' was built under R version 4.3.2
## Warning: package 'fastDummies' was built under R version 4.3.3
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data.train1 <- read.csv("D:\\Campss\\Season 6\\TPM\\Data Training.csv", sep = ";")
data.test1 <- read.csv("D:\\Campss\\Season 6\\TPM\\Data Testing.csv", sep = ";")
# Treat empty or whitespace-only strings as missing values (NA)
data.train <- data.train1 %>%
  mutate(across(everything(), ~ ifelse(trimws(.) == "", NA, .)))
data.test <- data.test1 %>%
  mutate(across(everything(), ~ ifelse(trimws(.) == "", NA, .)))
table(data.train$y)
##
## no yes
## 36548 4640
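The classes are heavily imbalanced, at roughly 7.9 "no" per "yes". As a quick sketch (not part of the original analysis), the ratio can be computed directly; it is what later motivates scale_pos_weight ≈ 8 in the XGBoost parameters.

# Sketch: quantify the class imbalance (36548 / 4640 ≈ 7.88)
class_counts <- table(data.train$y)
unname(class_counts["no"] / class_counts["yes"])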
Mode Imputation
# Function to compute the mode (most frequent value) of a vector
modus <- function(x) {
  # Drop NAs first
  x_no_na <- x[!is.na(x)]
  # Return the value that occurs most often
  uniqx <- unique(x_no_na)
  uniqx[which.max(tabulate(match(x_no_na, uniqx)))]
}
# For each column, compute the mode and replace NAs with it
data.train <- data.train %>%
  mutate(across(everything(), ~ {
    this_mode <- modus(.)
    ifelse(is.na(.), this_mode, .)
  }))
# Rows with missing values are simply dropped from the test set
data.test <- na.omit(data.test)
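Note the asymmetry: missing values in the training set are imputed with the mode, while rows with missing values are dropped from the test set. A quick sanity check (a sketch, not in the original) that no NAs remain:

# Sketch: confirm both sets are free of missing values
sum(is.na(data.train))
sum(is.na(data.test))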
One-Hot Encoding (1/2)
cols <- c("job", "marital", "education", "default", "housing", "loan",
          "contact", "month", "day_of_week", "poutcome")
# Convert these columns to factors
data.train[cols] <- lapply(data.train[cols], as.factor)
data.test[cols] <- lapply(data.test[cols], as.factor)
data_train_dummy <- dummy_cols(data.train,
                               select_columns = names(data.train)[sapply(data.train, is.factor)],
                               remove_selected_columns = TRUE)
data_test_dummy <- dummy_cols(data.test,
                              select_columns = names(data.test)[sapply(data.test, is.factor)],
                              remove_selected_columns = TRUE)
# Separate features (X) and target (y); duration is excluded from the features
X.train <- data_train_dummy %>% select(-c(y, duration))
y.train <- data_train_dummy$y
X.test <- data_test_dummy %>% select(-c(y, duration))
y.test <- data_test_dummy$y
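One caveat worth guarding against: if a factor level appears in only one of the two sets, dummy_cols produces mismatched columns between train and test. A defensive sketch (hypothetical, assuming both are plain data frames):

# Sketch: add any train-only dummy columns to the test set as zeros,
# then order the test columns to match the training set
missing_cols <- setdiff(names(X.train), names(X.test))
if (length(missing_cols) > 0) X.test[missing_cols] <- 0
X.test <- X.test[, names(X.train)]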
Outlier Detection with DBSCAN
## Warning: package 'dbscan' was built under R version 4.3.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
X.train.scaled <- scale(X.train)
minPts_value <- 20
# kNN distance plot for k = minPts_value; the elbow suggests a candidate eps
kNNdistplot(X.train.scaled, k = minPts_value)
abline(h = 10, col = "red", lty = 2)

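The eps of 10 is read off the elbow of the kNN distance plot above. As an alternative to eyeballing, a high quantile of the k-NN distances can serve as a candidate eps; a sketch using dbscan's kNNdist():

# Sketch: distance to the 20th nearest neighbor for every point;
# a high quantile approximates the elbow of the plot
knn_dist <- kNNdist(X.train.scaled, k = minPts_value)
quantile(knn_dist, probs = c(0.95, 0.99))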
set.seed(123)
# Run DBSCAN
dbscan_result <- dbscan(X.train.scaled, eps = 10, minPts = 20)
# Cluster summary; label 0 marks noise (outlier) points
print(table(dbscan_result$cluster))
##
## 0 1 2
## 21 40985 182
# Indices of the outlier (noise) points
outlier_idx <- which(dbscan_result$cluster == 0)
cat("Outlier indices:", outlier_idx, "\n")
## Outlier indices: 5394 6929 12384 14446 14487 16270 17740 21581 21582 22403 22643 24867 26681 26690 26757 28627 28628 35307 37356 37814 37823
# Visualize with PCA to reduce the dimensionality to 2-D
pca <- prcomp(X.train.scaled)
plot(pca$x[, 1:2],
     col = ifelse(dbscan_result$cluster == 0, "blue", dbscan_result$cluster + 1),
     pch = ifelse(dbscan_result$cluster == 0, 4, 19),
     main = "DBSCAN: Outliers (Blue) and Clusters")

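Since the scatter shows only the first two principal components, it is worth checking how much variance they actually capture before reading too much into the 2-D separation; a one-line sketch:

# Sketch: proportion of variance explained by PC1 and PC2
summary(pca)$importance[2, 1:2]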
# Drop the outliers from the training data
X.train_clean <- X.train[-outlier_idx, ]
y.train_clean <- y.train[-outlier_idx]
X.train <- X.train_clean
y.train <- y.train_clean
One-Hot Encoding (2/2)
X_train <- X.train
X_test <- X.test
y_train <- y.train
y_test <- y.test
# Build fully numeric design matrices (no intercept column)
X_train_numeric <- model.matrix(~ . - 1, data = X_train)
X_test_numeric <- model.matrix(~ . - 1, data = X_test)
# Encode the target as 0/1 ("no" -> 0, "yes" -> 1)
y_train_numeric <- as.numeric(as.factor(y_train))
y_test_numeric <- as.numeric(as.factor(y_test))
y_train_numeric <- ifelse(y_train_numeric == 1, 0, 1)
y_test_numeric <- ifelse(y_test_numeric == 1, 0, 1)
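Because as.factor() orders levels alphabetically, "no" becomes level 1 and "yes" level 2, so the ifelse() remap yields "no" -> 0 and "yes" -> 1. A quick cross-tab (a sketch) to confirm the mapping:

# Sketch: verify the label encoding ("no" -> 0, "yes" -> 1)
table(y_train, y_train_numeric)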
length(y_train_numeric)
## [1] 41167
XGBoost with 3-Fold CV
# XGBoost
set.seed(42)
# Convert the data to the DMatrix format required by xgboost
dtrain <- xgb.DMatrix(data = as.matrix(X_train_numeric),
                      label = y_train_numeric)
dtest <- xgb.DMatrix(data = as.matrix(X_test_numeric),
                     label = y_test_numeric)
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eval_metric = "logloss",
  max_depth = 6,
  eta = 0.1,
  gamma = 0,
  min_child_weight = 5,
  subsample = 1,
  colsample_bytree = 1,
  scale_pos_weight = 8  # roughly the negative-to-positive class ratio
)
cv_model <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 2000,
  nfold = 3,
  stratified = TRUE,
  print_every_n = 100,
  early_stopping_rounds = 50
)
## [1] train-logloss:0.664457+0.000208 test-logloss:0.664964+0.000437
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 50 rounds.
##
## [101] train-logloss:0.440976+0.001574 test-logloss:0.477492+0.004904
## [201] train-logloss:0.398956+0.000783 test-logloss:0.460118+0.003973
## [301] train-logloss:0.366537+0.002869 test-logloss:0.448266+0.006103
## [401] train-logloss:0.343992+0.002370 test-logloss:0.441510+0.005791
## [501] train-logloss:0.323250+0.004283 test-logloss:0.435008+0.004585
## [601] train-logloss:0.304977+0.002915 test-logloss:0.429974+0.005284
## [701] train-logloss:0.291321+0.002823 test-logloss:0.427270+0.004028
## [801] train-logloss:0.277854+0.003966 test-logloss:0.424470+0.004875
## [901] train-logloss:0.264695+0.003782 test-logloss:0.422275+0.004763
## [1001] train-logloss:0.253277+0.003580 test-logloss:0.420795+0.005412
## [1101] train-logloss:0.240633+0.005116 test-logloss:0.418817+0.005344
## [1201] train-logloss:0.229811+0.006593 test-logloss:0.417407+0.004892
## [1301] train-logloss:0.219665+0.005368 test-logloss:0.416257+0.004787
## Stopping. Best iteration:
## [1298] train-logloss:0.219897+0.005371 test-logloss:0.416244+0.004805
# Pick the best nrounds according to CV
best_iter <- which.min(cv_model$evaluation_log$test_logloss_mean)
cat("Best iteration based on logloss:", best_iter, "\n")
## Best iteration based on logloss: 1298
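The widening gap between train and test logloss in the log above hints at overfitting even while the CV score keeps improving slowly. A sketch for visualizing the learning curves from the CV log:

# Sketch: plot mean train vs. test logloss across boosting rounds
cv_log <- cv_model$evaluation_log
plot(cv_log$iter, cv_log$test_logloss_mean, type = "l", col = "red",
     xlab = "Boosting round", ylab = "Mean logloss")
lines(cv_log$iter, cv_log$train_logloss_mean, col = "blue")
legend("topright", legend = c("test (CV)", "train"),
       col = c("red", "blue"), lwd = 1)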
# Train the final model
model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = best_iter,
  watchlist = list(train = dtrain),
  verbose = 1,
  print_every_n = 100
)
## [1] train-logloss:0.664968
## [101] train-logloss:0.454625
## [201] train-logloss:0.420326
## [301] train-logloss:0.394607
## [401] train-logloss:0.377540
## [501] train-logloss:0.356395
## [601] train-logloss:0.338780
## [701] train-logloss:0.325145
## [801] train-logloss:0.314257
## [901] train-logloss:0.302038
## [1001] train-logloss:0.289278
## [1101] train-logloss:0.278187
## [1201] train-logloss:0.269173
## [1298] train-logloss:0.260204
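Before evaluating, it can be useful to see which features the model relies on; a sketch using xgboost's built-in importance functions:

# Sketch: gain-based feature importance of the final model
imp <- xgb.importance(model = model)
head(imp, 10)
xgb.plot.importance(imp[1:10, ])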
Model Evaluation
# ---- Evaluation on the Training Data ----
# Predicted probabilities on the training set
pred_prob_train <- predict(model, as.matrix(X_train_numeric))
# Convert probabilities to class labels (threshold 0.5)
pred_label_train <- ifelse(pred_prob_train > 0.5, 1, 0)
# Confusion matrix and evaluation metrics on the training set
conf_mat_train <- confusionMatrix(as.factor(pred_label_train), as.factor(y_train_numeric))
print(conf_mat_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33764 441
## 1 2767 4195
##
## Accuracy : 0.9221
## 95% CI : (0.9194, 0.9246)
## No Information Rate : 0.8874
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6802
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9243
## Specificity : 0.9049
## Pos Pred Value : 0.9871
## Neg Pred Value : 0.6026
## Prevalence : 0.8874
## Detection Rate : 0.8202
## Detection Prevalence : 0.8309
## Balanced Accuracy : 0.9146
##
## 'Positive' Class : 0
##
# ROC curve and AUC for the training data
roc_train <- roc(y_train_numeric, pred_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_train, col = "blue", main = "ROC Curve - Training Data")
auc_train <- auc(roc_train)
legend("bottomright", legend = paste("AUC =", round(auc_train, 3)), col = "blue", lwd = 2)

# ---- Evaluation on the Testing Data ----
# Predicted probabilities on the test set
pred_prob_test <- predict(model, as.matrix(X_test_numeric))
# Convert probabilities to class labels (threshold 0.5)
pred_label_test <- ifelse(pred_prob_test > 0.5, 1, 0)
# Confusion matrix and evaluation metrics on the test set
conf_mat_test <- confusionMatrix(as.factor(pred_label_test), as.factor(y_test_numeric))
print(conf_mat_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2504 35
## 1 216 335
##
## Accuracy : 0.9188
## 95% CI : (0.9086, 0.9282)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 2.365e-12
##
## Kappa : 0.6819
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9206
## Specificity : 0.9054
## Pos Pred Value : 0.9862
## Neg Pred Value : 0.6080
## Prevalence : 0.8803
## Detection Rate : 0.8104
## Detection Prevalence : 0.8217
## Balanced Accuracy : 0.9130
##
## 'Positive' Class : 0
##
# ROC curve and AUC for the testing data
roc_test <- roc(y_test_numeric, pred_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_test, col = "red", main = "ROC Curve - Testing Data")
auc_test <- auc(roc_test)
legend("bottomright", legend = paste("AUC =", round(auc_test, 3)), col = "red", lwd = 2)

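The 0.5 cutoff is a convention, and under class imbalance it is rarely optimal. pROC's coords() can pick a Youden-optimal threshold; a sketch, chosen on the training ROC to avoid tuning on the test set:

# Sketch: Youden-optimal threshold from the training ROC
coords(roc_train, "best",
       ret = c("threshold", "sensitivity", "specificity"))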
# Side-by-side ROC plots for train and test
par(mfrow = c(1, 2))
plot(roc_train, col = "blue", main = "ROC Curve - Training Data")
legend("bottomright", legend = paste("AUC =", round(auc_train, 3)), col = "blue", lwd = 2)
plot(roc_test, col = "red", main = "ROC Curve - Testing Data")
legend("bottomright", legend = paste("AUC =", round(auc_test, 3)), col = "red", lwd = 2)
f1_score <- function(TP, FP, FN) {
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * precision * recall / (precision + recall)
  return(f1)
}
# Training (positive class "0"; caret's table is indexed column-major:
# [1] = TP, [2] = FN, [3] = FP)
TP <- conf_mat_train$table[1]
FP <- conf_mat_train$table[3]
FN <- conf_mat_train$table[2]
cat("F1 Score Training:", f1_score(TP, FP, FN), "\n")
## F1 Score Training: 0.9546483
# Testing
TP <- conf_mat_test$table[1]
FP <- conf_mat_test$table[3]
FN <- conf_mat_test$table[2]
cat("F1 Score Testing:", f1_score(TP, FP, FN), "\n")
## F1 Score Testing: 0.9522723
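As a cross-check, caret already reports F1 for the positive class (here "0") in the byClass slot of the confusion matrix objects computed above; these should match the manual values:

# Sketch: F1 straight from caret
conf_mat_train$byClass["F1"]
conf_mat_test$byClass["F1"]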
