load libraries and setup parallel processing
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
## Warning: package 'themis' was built under R version 4.5.3
## Loading required package: recipes
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
## Warning: package 'doParallel' was built under R version 4.5.3
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(parallel)
gc() # Clear RAM
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2628236 140.4 4302872 229.8 4302872 229.8
## Vcells 4473795 34.2 10146329 77.5 7357856 56.2
cores <- parallel::detectCores()
cl <- makePSOCKcluster(cores - 1)
registerDoParallel(cl)
Plot Distribusi atau Boxplot Setelah Penanganan Outlier
# Identify
numeric_vars <- c("age", "campaign", "emp.var.rate", "cons.price.idx")
# Reshape
p_box <- df_clean %>%
select(all_of(numeric_vars)) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Variable, y = Value, fill = Variable)) +
geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.shape = 1) +
# INI KUNCINYA:
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
labs(title = "Boxplot Variabel Numerik (Skala Terpisah)",
subtitle = "Visualisasi sebaran data per variabel",
x = NULL, y = "Nilai") +
theme(legend.position = "none",
strip.text = element_text(face = "bold")) # Mempertebal nama variabel
# Density Plot untuk melihat normalitas/distribusi
p_density <- df_clean %>%
select(all_of(numeric_vars)) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Value, fill = Variable)) +
geom_density(alpha = 0.5) +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
labs(title = "Distribusi Kepadatan (Density Plot)",
subtitle = "Melihat bentuk distribusi setelah preprocessing",
x = "Nilai", y = "Density") +
theme(legend.position = "none")
# Tampilkan Plot
print(p_box)


Dummy Encoding & Reduksi Dimensi
# Dummy Encoding
dummies <- dummyVars(~ ., data = df_clean %>% select(-y))
df_transformed <- as.data.frame(predict(dummies, newdata = df_clean))
df_transformed$y <- df_clean$y
# Hapus Kolom Near-Zero Variance
nzv_cols <- nearZeroVar(df_transformed)
if(length(nzv_cols) > 0) {
df_transformed <- df_transformed[, -nzv_cols]
cat("\n[INFO] Menghapus", length(nzv_cols), "kolom NZV.\n")
}
##
## [INFO] Menghapus 16 kolom NZV.
# Hapus Multikolinearitas (Korelasi > 0.90)
corr_matrix <- cor(df_transformed %>% select(where(is.numeric)))
highlyCorDescr <- findCorrelation(corr_matrix, cutoff = 0.90)
if(length(highlyCorDescr) > 0) {
df_transformed <- df_transformed[, -highlyCorDescr]
cat("\n[INFO] Menghapus", length(highlyCorDescr), "kolom karena multikolinearitas.\n")
}
##
## [INFO] Menghapus 7 kolom karena multikolinearitas.
Tabel Data Setelah Encoding
## Warning: package 'kableExtra' was built under R version 4.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# 10 baris pertama
df_sample <- head(df_transformed, 10)
# tabel
knitr::kable(
df_sample,
caption = "Cuplikan Data Setelah Dummy Encoding dan Reduksi Dimensi (10 Baris Pertama)",
align = "c"
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center"
)
Cuplikan Data Setelah Dummy Encoding dan Reduksi Dimensi (10 Baris
Pertama)
|
age
|
job.admin.
|
job.blue-collar
|
job.management
|
job.services
|
job.technician
|
marital.divorced
|
marital.married
|
marital.single
|
education.basic
|
education.high.school
|
education.professional.course
|
education.university.degree
|
housing.no
|
loan.no
|
contact.telephone
|
month.apr
|
month.aug
|
month.jul
|
month.jun
|
month.may
|
day_of_week.fri
|
day_of_week.mon
|
day_of_week.thu
|
day_of_week.tue
|
day_of_week.wed
|
campaign
|
previous
|
poutcome.failure
|
poutcome.nonexistent
|
emp.var.rate
|
cons.price.idx
|
cons.conf.idx
|
nr.employed
|
previous_contact.1
|
y
|
|
31
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
0
|
0
|
0
|
1
|
1
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
5
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
34
|
1
|
0
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
0
|
0
|
1
|
1
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
2
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
44
|
0
|
0
|
0
|
0
|
1
|
1
|
0
|
0
|
0
|
1
|
0
|
0
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
5
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
29
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
1
|
0
|
0
|
0
|
1
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
1
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
32
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
3
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
32
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
2
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
48
|
0
|
0
|
0
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
3
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
29
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
1
|
0
|
0
|
0
|
1
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
1
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
32
|
0
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
0
|
0
|
1
|
0
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
2
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
|
38
|
0
|
1
|
0
|
0
|
0
|
0
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
1
|
0
|
0
|
0
|
0
|
1
|
0
|
1
|
0
|
0
|
0
|
2
|
0
|
0
|
1
|
11
|
93994
|
-364
|
5191
|
0
|
no
|
Multi-model Training
models_list <- list()
x_train <- train_set[, names(train_set) != "y"]
y_train <- train_set$y
base_preprocess <- c("center", "scale")
# 1. RPART (Decision Tree)
cat("Decision Tree (rpart)")
## Decision Tree (rpart)
t_rpart <- system.time({
set.seed(16)
models_list$rpart <- train(x = x_train, y = y_train, method = 'rpart', metric = 'ROC', trControl = train_control, preProcess = base_preprocess, tuneLength = 10)
})
cat("Selesai dalam", round(t_rpart["elapsed"], 2), "detik\n")
## Selesai dalam 24.3 detik
# 2. RANGER (Random Forest)
cat("Random Forest (ranger)")
## Random Forest (ranger)
t_rf <- system.time({
set.seed(16)
models_list$rf <- train(x = x_train, y = y_train, method = 'ranger', metric = 'ROC', trControl = train_control, preProcess = base_preprocess, tuneLength = 3, num.threads = 1, importance = 'impurity')
})
cat("Selesai dalam", round(t_rf["elapsed"], 2), "detik\n")
## Selesai dalam 30.86 detik
# 3. SVM RADIAL
cat("SVM (svmRadial)")
## SVM (svmRadial)
t_svm <- system.time({
set.seed(16)
models_list$svmRadial <- train(x = x_train, y = y_train, method = 'svmRadial', metric = 'ROC', trControl = train_control, preProcess = base_preprocess, tuneLength = 2, prob.model = TRUE)
})
cat("Selesai dalam", round(t_svm["elapsed"], 2), "detik\n")
## Selesai dalam 18.41 detik
# 4. NEURAL NETWORK (nnet)
cat("Neural Network (nnet)")
## Neural Network (nnet)
t_nnet <- system.time({
set.seed(16)
models_list$nnet <- train(x = x_train, y = y_train, method = 'nnet', metric = 'ROC', trControl = train_control, preProcess = base_preprocess, trace = FALSE, MaxNWts = 10000)
})
cat("Selesai dalam", round(t_nnet["elapsed"], 2), "detik\n")
## Selesai dalam 5.14 detik
# 5. BAGGED CART (treebag)
cat("Bagged CART (treebag)")
## Bagged CART (treebag)
t_treebag <- system.time({
set.seed(16)
models_list$treebag <- train(x = x_train, y = y_train, method = 'treebag', metric = 'ROC', trControl = train_control, preProcess = base_preprocess)
})
cat("Selesai dalam", round(t_treebag["elapsed"], 2), "detik\n")
## Selesai dalam 4.11 detik
# 6. GRADIENT BOOSTING (gbm)
cat("Gradient Boosting (gbm)")
## Gradient Boosting (gbm)
t_gbm <- system.time({
set.seed(16)
models_list$gbm <- train(x = x_train, y = y_train, method = 'gbm', metric = 'ROC', trControl = train_control, preProcess = base_preprocess, verbose = FALSE)
})
cat("Selesai dalam", round(t_gbm["elapsed"], 2), "detik\n")
## Selesai dalam 2.81 detik
# Matikan cluster dan kembali ke sequential processing
stopCluster(cl)
registerDoSEQ()
# Hitung Total Waktu Keseluruhan
total_waktu <- t_rpart["elapsed"] + t_rf["elapsed"] + t_svm["elapsed"] + t_nnet["elapsed"] + t_treebag["elapsed"] + t_gbm["elapsed"]
cat("\n=== Training Keseluruhan Selesai dalam", round(total_waktu / 60, 2), "menit ===\n")
##
## === Training Keseluruhan Selesai dalam 1.43 menit ===
Optimalisasi Threshold
thresholds <- seq(0.15, 0.95, by = 0.01)
best_th_list <- list()
actual_yes <- valid_set$y == 'yes'
for(mn in names(models_list)) {
probs_valid <- tryCatch({
predict(models_list[[mn]], newdata = valid_set, type = 'prob')[, 'yes']
}, error = function(e) { return(rep(0, length(actual_yes))) })
if(any(is.na(probs_valid))) probs_valid[is.na(probs_valid)] <- 0
f1_max <- 0
best_th <- 0.5
for(th in thresholds) {
pred_yes <- probs_valid >= th
tp <- sum(pred_yes & actual_yes, na.rm = TRUE)
fp <- sum(pred_yes & !actual_yes, na.rm = TRUE)
fn <- sum(!pred_yes & actual_yes, na.rm = TRUE)
prec <- ifelse((tp + fp) == 0, 0, tp / (tp + fp))
rec <- ifelse((tp + fn) == 0, 0, tp / (tp + fn))
f1_val <- ifelse((prec + rec) == 0, 0, 2 * (prec * rec) / (prec + rec))
if(is.na(f1_val)) f1_val <- 0
if(f1_val > f1_max) { f1_max <- f1_val; best_th <- th }
}
best_th_list[[mn]] <- data.frame(Model = mn, Best_Threshold = best_th, F1 = f1_max)
}
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
results_opt <- dplyr::bind_rows(best_th_list) %>% arrange(desc(F1))
Evaluasi & Winning Model
eval_results_custom <- list()
actual_y <- valid_set$y
for(i in 1:nrow(results_opt)) {
mn <- results_opt$Model[i]
opt_th <- results_opt$Best_Threshold[i]
probs_valid <- tryCatch({
p <- predict(models_list[[mn]], newdata = valid_set, type = 'prob')
if(!is.null(dim(p)) && "yes" %in% colnames(p)) { p[, 'yes'] } else { rep(0, length(actual_y)) }
}, error = function(e) { rep(0, length(actual_y)) })
if(length(probs_valid) != length(actual_y)) probs_valid <- rep(0, length(actual_y))
probs_valid[is.na(probs_valid)] <- 0
pred_custom <- ifelse(probs_valid >= opt_th, "yes", "no")
pred_custom_factor <- factor(pred_custom, levels = c("no", "yes"))
actual_factor <- factor(actual_y, levels = c("no", "yes"))
# mode = "everything" untuk memunculkan Precision dan Specificity
cm <- confusionMatrix(pred_custom_factor, actual_factor, positive = "yes", mode = "everything")
# Memasukkan semua metrik ke dalam dataframe
eval_results_custom[[mn]] <- data.frame(
Model = mn,
Optimal_Threshold = opt_th,
Accuracy = cm$overall["Accuracy"],
Precision = cm$byClass["Precision"],
Sensitivity_Recall = cm$byClass["Sensitivity"],
Specificity = cm$byClass["Specificity"],
F1_Score = cm$byClass["F1"]
)
}
## Warning in method$prob(modelFit = modelFit, newdata = newdata, submodels =
## param): kernlab class probability calculations failed; returning NAs
# Menggabungkan hasil dan mengurutkan berdasarkan F1-Score tertinggi
eval_df_custom <- dplyr::bind_rows(eval_results_custom) %>% arrange(desc(F1_Score))
# Merapikan tampilan dengan menghapus rownames bawaan
rownames(eval_df_custom) <- NULL
# tabel lengkap
print(eval_df_custom)
## Model Optimal_Threshold Accuracy Precision Sensitivity_Recall
## 1 rf 0.62 0.8451164 0.4611231 0.6671875
## 2 gbm 0.63 0.8375027 0.4445596 0.6703125
## 3 rpart 0.72 0.8218403 0.4168988 0.7015625
## 4 nnet 0.64 0.8279313 0.4248756 0.6671875
## 5 treebag 0.65 0.8187949 0.4008222 0.6093750
## 6 svmRadial 0.50 0.8607788 NA 0.0000000
## Specificity F1_Score
## 1 0.8738944 0.5453384
## 2 0.8645438 0.5345794
## 3 0.8412939 0.5230052
## 4 0.8539297 0.5191489
## 5 0.8526662 0.4835710
## 6 1.0000000 NA
# juara sejati
true_best_model <- eval_df_custom$Model[1]
true_best_th <- eval_df_custom$Optimal_Threshold[1]
cat('\nJUARA SEJATI:', true_best_model, '| F1-Score:', round(eval_df_custom$F1_Score[1], 4), '| Threshold:', true_best_th, '\n')
##
## JUARA SEJATI: rf | F1-Score: 0.5453 | Threshold: 0.62
Visualisasi Evaluasi Model
Optimalisasi Treshold
# Mengambil probabilitas prediksi dari model juara pada data validasi
final_model <- models_list[[true_best_model]]
probs_valid <- predict(final_model, newdata = valid_set, type = 'prob')[, 'yes']
actual_yes <- valid_set$y == 'yes'
# Kalkulasi ulang metrik untuk plotting
metrics_list <- list()
for(th in seq(0.30, 0.85, by = 0.02)) {
pred_yes <- probs_valid >= th
tp <- sum(pred_yes & actual_yes, na.rm = TRUE)
fp <- sum(pred_yes & !actual_yes, na.rm = TRUE)
fn <- sum(!pred_yes & actual_yes, na.rm = TRUE)
prec <- ifelse((tp + fp) == 0, 0, tp / (tp + fp))
rec <- ifelse((tp + fn) == 0, 0, tp / (tp + fn))
f1_val <- ifelse((prec + rec) == 0, 0, 2 * (prec * rec) / (prec + rec))
metrics_list[[length(metrics_list) + 1]] <- data.frame(
Threshold = th, Precision = prec, Sensitivity = rec, F1_Score = f1_val
)
}
metrics_df <- dplyr::bind_rows(metrics_list)
# Reshape data untuk ggplot
metrics_long <- metrics_df %>%
pivot_longer(cols = c(Precision, Sensitivity, F1_Score), names_to = "Metric", values_to = "Value")
# Buat Plot
p_opt <- ggplot(metrics_long, aes(x = Threshold, y = Value, color = Metric)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_vline(xintercept = true_best_th, linetype = "dashed", color = "black", size = 1) +
annotate("text", x = true_best_th + 0.01, y = 0.8, label = paste("Optimal:", true_best_th), hjust = 0, fontface = "bold") +
theme_minimal() +
labs(title = paste("Threshold Optimization Plot (", true_best_model, ")", sep=""),
subtitle = "Pergerakan F1-Score, Sensitivity, dan Precision",
x = "Probability Threshold", y = "Score") +
scale_color_brewer(palette = "Set1")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ROC/AUC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Ambil probabilitas dari model juara (Random Forest)
final_model <- models_list[[true_best_model]]
probs_valid <- predict(final_model, newdata = valid_set, type = 'prob')[, 'yes']
# Buat objek ROC
roc_obj <- roc(valid_set$y, probs_valid, levels = c("no", "yes"))
## Setting direction: controls < cases
auc_val <- auc(roc_obj)
# Plot Kurva ROC
plot(roc_obj, main = paste("ROC Curve -", true_best_model),
col = "#1f77b4", lwd = 3,
print.auc = TRUE,
auc.polygon = TRUE,
grid = TRUE,
auc.polygon.col = "#d3d3d355")

Heatmap Confusion Matrix
# Terapkan threshold pemenang
pred_custom <- ifelse(probs_valid >= true_best_th, "yes", "no")
pred_custom_factor <- factor(pred_custom, levels = c("no", "yes"))
actual_factor <- factor(valid_set$y, levels = c("no", "yes"))
cm <- confusionMatrix(pred_custom_factor, actual_factor, positive = "yes")
cm_table <- as.data.frame(cm$table)
p_cm <- ggplot(data = cm_table, aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq), color = "white") +
scale_fill_gradient(low = "lightblue", high = "dodgerblue4") +
geom_text(aes(label = Freq), vjust = 0.5, fontface = "bold",
color = ifelse(cm_table$Freq > (max(cm_table$Freq)*0.4), "white", "black"), size = 6) +
theme_minimal() +
labs(title = paste("Confusion Matrix Heatmap (Threshold ", true_best_th, ")", sep=""),
subtitle = "Evaluasi pada Validation Set",
x = "Actual Class (Reference)",
y = "Predicted Class") +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold"))
print(p_cm)

Variable Importance Plot
## Warning: package 'gbm' was built under R version 4.5.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Hitung Tingkat Kepentingan Fitur dari model juara
var_imp_model <- varImp(final_model, scale = TRUE)
imp_df <- as.data.frame(var_imp_model$importance)
imp_df$Feature <- rownames(imp_df)
# Ekstrak nama kolom metrik
metric_col <- names(imp_df)[1]
# Ambil Top 15 Fitur
imp_top15 <- imp_df %>%
arrange(desc(!!sym(metric_col))) %>%
head(15)
p_varimp <- ggplot(imp_top15, aes(x = reorder(Feature, !!sym(metric_col)), y = !!sym(metric_col))) +
geom_col(fill = "steelblue", color = "black", alpha = 0.8) +
coord_flip() +
theme_minimal() +
labs(title = paste("Top 15 Variable Importance (", true_best_model, ")", sep=""),
subtitle = "Fitur paling berpengaruh terhadap prediksi target",
x = "Fitur (Variabel)",
y = "Tingkat Kepentingan (Importance Score)") +
theme(axis.text.y = element_text(size = 10, face = "bold"))
print(p_varimp)

Alignment dengan Dataset B & Final predict
df_B <- read.csv("dataset_B.csv", stringsAsFactors = TRUE)
df_B_clean <- df_B %>%
# Ubah unknown menjadi NA
mutate(across(where(is.factor), ~na_if(as.character(.), "unknown"))) %>%
# Lakukan transformasi lainnya yang sama persis di Dataset A
mutate(education = ifelse(grepl("basic", as.character(education)), "basic", as.character(education))) %>%
mutate(
previous_contact = ifelse(pdays != 999, 1, 0),
previous_contact = as.factor(previous_contact)
) %>%
select(-pdays) %>%
mutate(year = case_when(
emp.var.rate > 0 ~ 2008,
emp.var.rate <= 0 & emp.var.rate > -2 ~ 2009,
emp.var.rate <= -2 ~ 2010
)) %>%
mutate(year = as.factor(year)) %>%
mutate(across(where(is.character), as.factor)) %>%
select(-duration)
# Alignment Factor Levels antara A dan B (hanya untuk kolom yang ada di kedua dataset)
for(col in names(df_clean)) {
if(is.factor(df_clean[[col]]) && col %in% names(df_B_clean)) {
# Paksa level kategori di B agar sama persis dengan A
df_B_clean[[col]] <- factor(df_B_clean[[col]], levels = levels(df_clean[[col]]))
}
}
# Transformasi Dummy
df_B_transformed <- as.data.frame(predict(dummies, newdata = df_B_clean, na.action = na.pass))
# Ganti nilai NA hasil dari 'unknown' di matriks dummy menjadi 0
df_B_transformed[is.na(df_B_transformed)] <- 0
# Hapus kolom NZV jika ada
if(exists("nzv_cols") && length(nzv_cols) > 0) {
df_B_transformed <- df_B_transformed[, -nzv_cols, drop = FALSE]
}
# apus kolom yang memiliki multikolinearitas
if(exists("highlyCorDescr") && length(highlyCorDescr) > 0) {
df_B_transformed <- df_B_transformed[, -highlyCorDescr, drop = FALSE]
}
# PREDIKSI
final_model <- models_list[[true_best_model]]
pred_prob_B <- predict(final_model, newdata = df_B_transformed, type = "prob")[, "yes"]
pred_class_B_custom <- ifelse(pred_prob_B >= true_best_th, "yes", "no")
# Membuat dataframe baru hanya dengan kolom 'id' dan 'y_pred'
df_submission <- data.frame(
id = 1:nrow(df_B),
y_pred = pred_class_B_custom
)
# Simpan Hasil Sesuai Format
write.csv(df_submission, "Hasil_Prediksi_Format_Submission.csv", row.names = FALSE)
cat("\nFile 'Hasil_Prediksi_Format_Submission.csv' berhasil dibuat sesuai format tugas.\n")
##
## File 'Hasil_Prediksi_Format_Submission.csv' berhasil dibuat sesuai format tugas.