library(rpart) # Decision Tree
library(rpart.plot) # Visualisasi Decision Tree
library(nnet) # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(themis) # SMOTE (via recipes)
library(recipes) # Pipeline pra-pemrosesan
library(caret) # CV, tuning, confusion matrix
library(pROC) # AUC-ROC
library(ggplot2) # Visualisasi
library(dplyr) # Manipulasi data
library(reshape2) # Melt untuk plot
library(scales) # Format label
# install.packages(c("rpart","rpart.plot","nnet","randomForest",
# "themis","recipes","caret","pROC",
# "ggplot2","dplyr","reshape2","scales"))
# Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, SMOTE, cross-validation, model berbasis pohon) menghasilkan hasil yang sama setiap kali dijalankan ulang.
## Seed aktif: 42
# Load the raw data set; text columns are kept as character vectors.
df <- read.csv(
  file = "Occupancy_Estimation.csv",
  stringsAsFactors = FALSE
)

# Report the dimensions of the table that was just read.
cat("Dataset dimuat:", dim(df)[1], "baris x", dim(df)[2], "kolom\n")
## Dataset dimuat: 10129 baris x 19 kolom
## Missing values : 0
# Remove the Date and Time columns from the working table.
df_bersih <- select(df, -c(Date, Time))

# The "kelas_" prefix turns every class label into a syntactically valid R
# name, which caret needs for factor levels.
LEVELS <- paste0("kelas_", 0:3)

# Re-encode the target as a factor with a fixed level order.
df_bersih$Room_Occupancy_Count <- factor(
  paste0("kelas_", df_bersih$Room_Occupancy_Count),
  levels = LEVELS
)

# Every column except the target counts as a predictor.
cat("Fitur prediktor :", length(df_bersih) - 1, "\n")
## Fitur prediktor : 16
## Kelas target : kelas_0 kelas_1 kelas_2 kelas_3
# SEED is used throughout this file but is never assigned in it. Fall back to
# 42, the value echoed in the recorded output, so the script can run on its
# own; an existing SEED is left untouched.
if (!exists("SEED")) {
  SEED <- 42
}

# 80/20 split via caret::createDataPartition on the target column.
set.seed(SEED)
idx_latih <- createDataPartition(
  df_bersih$Room_Occupancy_Count,
  p = 0.8,
  list = FALSE
)
data_latih <- df_bersih[idx_latih, ]
data_uji <- df_bersih[-idx_latih, ]

# Test predictors and labels, used by ALL models (no standardisation).
X_uji <- data_uji %>% select(-Room_Occupancy_Count)
y_uji <- data_uji$Room_Occupancy_Count

cat("Data latih :", nrow(data_latih), "observasi\n")
## Data latih : 8106 observasi
## Data uji : 2023 observasi
##
## Distribusi kelas — Data Latih:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi kelas — Data Uji:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 1645 91 149 138
Mengapa SMOTE?
Kelas 0 (kosong) mencakup ~81% data. Tanpa penanganan, model cenderung mengabaikan kelas minoritas.
SMOTE membangkitkan sampel sintetis untuk kelas 1, 2, dan 3 melalui interpolasi di ruang fitur — hanya diterapkan pada data latih agar data uji tetap mencerminkan kondisi nyata.
Catatan: Seluruh model menggunakan data yang sama (hasil SMOTE, tanpa standarisasi) agar perbandingan antar model berlaku adil.
# Oversample the training split with SMOTE through a recipes pipeline.
# The seed is set first so the synthetic rows are reproducible.
set.seed(SEED)
rec_smote <- recipe(Room_Occupancy_Count ~ ., data = data_latih) %>%
step_smote(Room_Occupancy_Count, over_ratio = 1, neighbors = 5)
# prep() estimates the recipe on data_latih; bake(new_data = NULL) returns the
# processed training data (the recorded output shows 6583 rows per class).
data_latih_s <- rec_smote %>%
prep() %>%
bake(new_data = NULL)
# Make sure the factor levels are consistent
data_latih_s$Room_Occupancy_Count <- factor(
data_latih_s$Room_Occupancy_Count,
levels = LEVELS
)
cat("Distribusi sebelum SMOTE:\n")
## Distribusi sebelum SMOTE:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi setelah SMOTE:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 6583 6583 6583
# Stack the class labels of the training data before and after SMOTE into a
# single long table, tagged by condition.
df_smote_vis <- data.frame(
  Kondisi = rep(
    c("Sebelum SMOTE", "Setelah SMOTE"),
    times = c(nrow(data_latih), nrow(data_latih_s))
  ),
  Kelas = c(
    as.character(data_latih$Room_Occupancy_Count),
    as.character(data_latih_s$Room_Occupancy_Count)
  )
)

# Fix the facet order: "before" on the left, "after" on the right.
df_smote_vis$Kondisi <- factor(
  df_smote_vis$Kondisi,
  levels = c("Sebelum SMOTE", "Setelah SMOTE")
)

# Bar chart of class counts per condition, with the count printed on each bar.
ggplot(df_smote_vis, aes(x = Kelas, fill = Kelas)) +
  geom_bar(color = "black", width = 0.6) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.4,
    size = 3.8,
    fontface = "bold"
  ) +
  facet_wrap(~Kondisi) +
  scale_fill_manual(
    values = c(
      "kelas_0" = "#4472C4",
      "kelas_1" = "#ED7D31",
      "kelas_2" = "#70AD47",
      "kelas_3" = "#E74C3C"
    )
  ) +
  labs(
    title = "Distribusi Kelas Sebelum dan Setelah SMOTE",
    subtitle = "SMOTE hanya diterapkan pada data latih",
    x = "Kelas Penghuni",
    y = "Frekuensi",
    fill = "Kelas"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )
# ── Weighted-average multiclass AUC-ROC helper ─────────────────────────────
#' Weighted-average one-vs-rest AUC-ROC for a multiclass problem.
#'
#' For each label in the global `LEVELS` vector the true labels are binarised
#' (this class vs. the rest), a ROC curve is built from that class's
#' probability column, and the AUC is weighted by the class's share of
#' `y_aktual`.
#'
#' @param y_aktual Vector or factor of true class labels.
#' @param prob_mat Matrix or data frame of class probabilities with one column
#'   named after each element of `LEVELS` and one row per observation.
#' @return A list with `auc_weighted` (numeric scalar; `NA` when no class
#'   could be scored) and `auc_per_kelas` (numeric vector named by `LEVELS`;
#'   `NA` for classes that are absent from, or the only class in, `y_aktual`).
hitung_auc <- function(y_aktual, prob_mat) {
  kolom_hilang <- setdiff(LEVELS, colnames(prob_mat))
  if (length(kolom_hilang) > 0) {
    stop(
      "prob_mat tidak memiliki kolom: ",
      paste(kolom_hilang, collapse = ", "),
      call. = FALSE
    )
  }

  n_total <- length(y_aktual)
  aucs <- rep(NA_real_, length(LEVELS))
  bobot <- numeric(length(LEVELS))

  for (i in seq_along(LEVELS)) {
    kls <- LEVELS[i]
    bin_aktual <- as.integer(y_aktual == kls)
    n_positif <- sum(bin_aktual)
    bobot[i] <- n_positif / n_total

    # A ROC curve needs both positives and negatives; otherwise keep NA.
    if (n_positif == 0 || n_positif == n_total) {
      next
    }

    # Pin controls = 0, cases = 1 and "higher probability means case".
    # With pROC's default direction = "auto" a class whose scores are ranked
    # the wrong way round would be flipped and reported with AUC above 0.5.
    roc_obj <- roc(
      bin_aktual,
      prob_mat[, kls],
      levels = c(0, 1),
      direction = "<",
      quiet = TRUE
    )
    aucs[i] <- as.numeric(auc(roc_obj))
  }

  # Weighted average over the classes that could be scored.
  valid <- !is.na(aucs)
  auc_w <- if (any(valid)) {
    sum(aucs[valid] * bobot[valid]) / sum(bobot[valid])
  } else {
    NA_real_
  }

  list(auc_weighted = auc_w, auc_per_kelas = setNames(aucs, LEVELS))
}
# ── Fungsi evaluasi utama ───────────────────────────────────────────────────
#' Evaluate multiclass predictions and print a short report.
#'
#' Builds a caret confusion matrix, macro-averages the per-class balanced
#' accuracy, precision, recall and F1, and optionally computes the weighted
#' AUC-ROC through `hitung_auc()`. The report and the confusion table are
#' printed to the console.
#'
#' @param y_aktual True class labels (coerced to a factor on `LEVELS`).
#' @param y_pred Predicted class labels (coerced to a factor on `LEVELS`).
#' @param prob_mat Optional matrix/data frame of class probabilities with
#'   columns named after `LEVELS`; when `NULL` the AUC is skipped.
#' @param nama_model Model label shown in the report header.
#' @return A list with `cm` (the confusionMatrix object), `bal_acc`,
#'   `presisi`, `recall`, `f1`, `auc` (`NA` without probabilities) and
#'   `auc_per_kelas` (per-class AUC vector, `NULL` without probabilities).
evaluasi <- function(y_aktual, y_pred, prob_mat = NULL, nama_model) {
  # Put both vectors on the same level set so the confusion matrix is square.
  y_aktual <- factor(as.character(y_aktual), levels = LEVELS)
  y_pred <- factor(as.character(y_pred), levels = LEVELS)
  cm <- confusionMatrix(y_pred, y_aktual)

  # Macro averages: unweighted mean of the per-class statistics.
  bal_acc <- mean(cm$byClass[, "Balanced Accuracy"], na.rm = TRUE)
  pre <- mean(cm$byClass[, "Precision"], na.rm = TRUE)
  rec <- mean(cm$byClass[, "Recall"], na.rm = TRUE)
  f1 <- mean(cm$byClass[, "F1"], na.rm = TRUE)

  # Weighted AUC-ROC (needs class probabilities).
  auc_w <- NA
  auc_per <- NULL
  if (!is.null(prob_mat)) {
    auc_res <- hitung_auc(y_aktual, prob_mat)
    auc_w <- auc_res$auc_weighted
    auc_per <- auc_res$auc_per_kelas
  }

  cat(sprintf("\n========== %s ==========\n", nama_model))
  cat(sprintf(" Balanced Accuracy : %.4f (%.2f%%)\n", bal_acc, bal_acc * 100))
  cat(sprintf(" Presisi (macro) : %.4f\n", pre))
  cat(sprintf(" Recall (macro) : %.4f\n", rec))
  cat(sprintf(" F1-Score (macro) : %.4f\n", f1))
  if (!is.na(auc_w)) {
    cat(sprintf(" AUC-ROC (weighted): %.4f\n", auc_w))
    cat(" AUC per kelas :", round(auc_per, 4), "\n")
  }
  cat("\nConfusion Matrix:\n")
  print(cm$table)

  # The per-class AUC is returned too, so callers need not recompute it.
  list(
    cm = cm, bal_acc = bal_acc, presisi = pre,
    recall = rec, f1 = f1, auc = auc_w, auc_per_kelas = auc_per
  )
}

# 5-fold stratified CV is used to select the best hyperparameters. The tuning
# metric is ROC (macro AUC-ROC) so that it is consistent with the final
# evaluation metric.
# Resampling setup for cross-validation: 5-fold CV.
ctrl_cv_lengkap <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE, # required so that AUC can be computed
  summaryFunction = multiClassSummary, # AUC, Accuracy, Kappa for multi-class
  savePredictions = "final"
)

# cp (complexity parameter): controls the size of the tree.
# Small values -> deeper tree (risk of overfitting),
# large values -> simpler tree (risk of underfitting).
# Tuning is based on AUC-ROC under 5-fold CV.
# Tune the rpart complexity parameter with caret over a fixed cp grid,
# selecting on the "AUC" column of the resampling summary.
# NOTE(review): no rpart.control limits are passed here, while the final tree
# later in this file is grown with maxdepth = 5 — confirm the cp chosen here
# is meant to carry over to that constrained tree.
set.seed(SEED)
grid_dt <- expand.grid(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05))
tuning_dt <- train(
Room_Occupancy_Count ~ .,
data = data_latih_s,
method = "rpart",
trControl = ctrl_cv_lengkap, # must be the control object with classProbs = TRUE
tuneGrid = grid_dt,
metric = "AUC"
)
cat("Hasil Tuning Decision Tree:\n")
## Hasil Tuning Decision Tree:
## cp AUC Accuracy Kappa
## 1 1e-04 0.9988971 0.9955566 0.9940755
## 2 5e-04 0.9983245 0.9924425 0.9899234
## 3 1e-03 0.9973368 0.9887966 0.9850622
## 4 5e-03 0.9947655 0.9758086 0.9677448
## 5 1e-02 0.9901377 0.9541239 0.9388319
## 6 5e-02 0.9532082 0.8866776 0.8489036
##
## CP terbaik: 1e-04
Pruning dilakukan dengan kombinasi:
- Pre-pruning: `maxdepth = 5`, `minsplit = 20`, `minbucket = 7` untuk mencegah pohon tumbuh terlalu dalam.
- Post-pruning: `prune()` menggunakan `cp` terbaik dari CV untuk memangkas cabang yang tidak signifikan.
# Fit the final decision tree: grow it under pre-pruning limits, then
# post-prune it with the cp selected by the caret tuning run.
set.seed(SEED)
cp_terbaik <- tuning_dt$bestTune$cp
# Train with pre-pruning
model_dt_full <- rpart(
Room_Occupancy_Count ~ .,
data = data_latih_s,
method = "class",
control = rpart.control(
cp = 0.00001, # let the tree grow first, post-prune afterwards
maxdepth = 5, # pre-pruning: depth limit
minsplit = 20, # pre-pruning: min observations needed to split
minbucket = 7 # pre-pruning: min observations in a leaf
)
)
# Post-pruning: prune with the best cp from CV
model_dt <- prune(model_dt_full, cp = cp_terbaik)
cat("Decision Tree dilatih & dipangkas dengan cp =", cp_terbaik, "\n")
## Decision Tree dilatih & dipangkas dengan cp = 1e-04
## Jumlah node setelah pruning: 43
##
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih_s,
## method = "class", control = rpart.control(cp = 1e-05, maxdepth = 5,
## minsplit = 20, minbucket = 7))
##
## Variables actually used in tree construction:
## [1] S1_Light S1_Sound S1_Temp S2_Light S2_Temp
## [6] S3_Light S3_Sound S3_Temp S5_CO2 S5_CO2_Slope
## [11] S7_PIR
##
## Root node error: 19749/26332 = 0.75
##
## n= 26332
##
## CP nsplit rel error xerror xstd
## 1 0.31297787 0 1.000000 1.008861 0.0035258
## 2 0.24765811 1 0.687022 0.688946 0.0041060
## 3 0.12658869 2 0.439364 0.439465 0.0038624
## 4 0.08309281 3 0.312775 0.313889 0.0034860
## 5 0.07904198 4 0.229683 0.230796 0.0031086
## 6 0.03691326 5 0.150641 0.150843 0.0026027
## 7 0.02724189 6 0.113727 0.113930 0.0022969
## 8 0.01316522 7 0.086485 0.086688 0.0020259
## 9 0.01235506 8 0.073320 0.076966 0.0019163
## 10 0.00886121 9 0.060965 0.061168 0.0017191
## 11 0.00825358 10 0.052104 0.054079 0.0016209
## 12 0.00693706 11 0.043850 0.044610 0.0014776
## 13 0.00450656 12 0.036913 0.038280 0.0013721
## 14 0.00151906 13 0.032407 0.033774 0.0012911
## 15 0.00136716 14 0.030888 0.032154 0.0012605
## 16 0.00111398 15 0.029520 0.030533 0.0012291
## 17 0.00070890 16 0.028407 0.029622 0.0012110
## 18 0.00050635 17 0.027698 0.028862 0.0011957
## 19 0.00025318 18 0.027191 0.028660 0.0011916
## 20 0.00015191 19 0.026938 0.028660 0.0011916
## 21 0.00001000 21 0.026634 0.028204 0.0011823
# Draw the decision tree with rpart.plot.
# NOTE(review): the object plotted is tuning_dt$finalModel (the caret fit),
# not model_dt, which is the tree evaluated on the test set — confirm this is
# the tree the "(Pruned)" title is meant to show.
library(rpart.plot)
rpart.plot(
tuning_dt$finalModel, # model_dt was replaced by tuning_dt$finalModel here
type = 2,
extra = 104,
fallen.leaves = TRUE,
main = "Decision Tree (Pruned) - Estimasi Penghuni Ruangan",
box.palette = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000"),
shadow.col = "gray",
cex = 0.75,
tweak = 1.1
)
# Prediction time
# Time the class prediction of the pruned tree on the test predictors.
waktu_pred_dt <- system.time({
  pred_dt <- predict(model_dt, newdata = X_uji, type = "class")
})

# Class probabilities for AUC-ROC, with columns labelled by LEVELS.
prob_dt <- setNames(
  as.data.frame(predict(model_dt, newdata = X_uji, type = "prob")),
  LEVELS
)

hasil_dt <- evaluasi(
  y_aktual = y_uji,
  y_pred = pred_dt,
  prob_mat = prob_dt,
  nama_model = "Decision Tree"
)
##
## ========== Decision Tree ==========
## Balanced Accuracy : 0.9829 (98.29%)
## Presisi (macro) : 0.9685
## Recall (macro) : 0.9687
## F1-Score (macro) : 0.9686
## AUC-ROC (weighted): 0.9974
## AUC per kelas : 0.9985 0.993 0.9908 0.9937
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1643 0 0 1
## kelas_1 0 88 2 0
## kelas_2 1 2 143 6
## kelas_3 1 1 4 131
# Model size: number of rows in the tree's frame (one per node).
ukuran_dt <- dim(model_dt$frame)[1]
cat(sprintf("\nJumlah node Decision Tree : %d\n", ukuran_dt))
##
## Jumlah node Decision Tree : 43
## Waktu prediksi (detik) : 0.0000
Catatan: Regresi Logistik menggunakan data yang sama dengan model lain (hasil SMOTE, tanpa standarisasi) agar perbandingan antar model berlaku adil.
decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar. Tuning didasarkan pada AUC-ROC.
# Tune the weight-decay penalty of the multinomial logistic regression with
# caret over a fixed grid, selecting on the "AUC" column.
set.seed(SEED)
grid_rl <- expand.grid(decay = c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5))
tuning_rl <- train(
Room_Occupancy_Count ~ .,
data = data_latih_s, # same data as the other models (fair comparison)
method = "multinom",
trControl = ctrl_cv_lengkap,
tuneGrid = grid_rl,
maxit = 200,
MaxNWts = 5000,
trace = FALSE,
metric = "AUC" # tune on AUC-ROC
)
cat("Hasil Tuning Regresi Logistik:\n")
## Hasil Tuning Regresi Logistik:
# Print only the result columns that exist (the set includes both "AUC" and "ROC").
print(tuning_rl$results[, intersect(c("decay","AUC","ROC","Accuracy","Kappa"), names(tuning_rl$results))])
## decay AUC Accuracy Kappa
## 1 1e-04 0.9996347 0.9898602 0.9864803
## 2 1e-03 0.9996208 0.9897843 0.9863790
## 3 1e-02 0.9995858 0.9897842 0.9863789
## 4 5e-02 0.9995358 0.9888728 0.9851637
## 5 1e-01 0.9995014 0.9884170 0.9845560
## 6 5e-01 0.9993671 0.9872018 0.9829357
##
## Decay terbaik: 1e-04
# Fit the final multinomial logistic regression with the tuned decay and
# record the training time.
set.seed(SEED)
decay_terbaik <- tuning_rl$bestTune$decay
waktu_latih_rl <- system.time(
suppressMessages(
model_rl <- multinom(
Room_Occupancy_Count ~ .,
data = data_latih_s, # same data (no standardisation)
MaxNWts = 5000,
maxit = 200,
trace = FALSE,
decay = decay_terbaik
)
)
)
cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")
## Regresi Logistik dilatih dengan decay = 1e-04
## Waktu pelatihan (detik): 7.64
# Time the class prediction of the logistic model on the test predictors.
waktu_pred_rl <- system.time({
  pred_rl <- predict(model_rl, newdata = X_uji, type = "class")
})

# Class probabilities for AUC-ROC, with columns labelled by LEVELS.
prob_rl <- setNames(
  as.data.frame(predict(model_rl, newdata = X_uji, type = "probs")),
  LEVELS
)

hasil_rl <- evaluasi(
  y_aktual = y_uji,
  y_pred = pred_rl,
  prob_mat = prob_rl,
  nama_model = "Regresi Logistik"
)
##
## ========== Regresi Logistik ==========
## Balanced Accuracy : 0.9881 (98.81%)
## Presisi (macro) : 0.9870
## Recall (macro) : 0.9792
## F1-Score (macro) : 0.9829
## AUC-ROC (weighted): 0.9969
## AUC per kelas : 0.9974 0.9988 0.9938 0.9935
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 3
## kelas_1 0 88 1 0
## kelas_2 0 3 148 3
## kelas_3 0 0 0 132
# Model size: total number of fitted coefficients.
ukuran_rl <- length(stats::coef(model_rl))
cat(sprintf("\nJumlah parameter model : %d\n", ukuran_rl))
##
## Jumlah parameter model : 51
## Waktu prediksi (detik) : 0.0100
mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Tuning didasarkan pada AUC-ROC.
# Tune mtry of the random forest with caret over a fixed grid, selecting on
# the "AUC" column. ntree = 100 is passed for the tuning fits.
set.seed(SEED)
grid_rf <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))
tuning_rf <- train(
Room_Occupancy_Count ~ .,
data = data_latih_s,
method = "rf",
trControl = ctrl_cv_lengkap,
tuneGrid = grid_rf,
ntree = 100,
metric = "AUC" # tune on AUC-ROC
)
cat("Hasil Tuning Random Forest:\n")
## Hasil Tuning Random Forest:
# Print only the result columns that exist (the set includes both "AUC" and "ROC").
print(tuning_rf$results[, intersect(c("mtry","AUC","ROC","Accuracy","Kappa"), names(tuning_rf$results))])
## mtry AUC Accuracy Kappa
## 1 2 0.9999996 0.9995822 0.9994430
## 2 4 0.9999995 0.9996202 0.9994937
## 3 6 0.9999991 0.9995443 0.9993924
## 4 8 0.9999992 0.9995823 0.9994430
## 5 10 0.9999741 0.9995823 0.9994430
## 6 12 0.9999736 0.9993164 0.9990885
##
## mtry terbaik: 2
# Fit the final random forest with the tuned mtry and record the training
# time; importance = TRUE is requested for the importance plot further down.
set.seed(SEED)
mtry_terbaik <- tuning_rf$bestTune$mtry
# ntree = 300 in the final model for more stable predictions
waktu_latih_rf <- system.time(
model_rf <- randomForest(
Room_Occupancy_Count ~ .,
data = data_latih_s,
ntree = 300,
mtry = mtry_terbaik,
importance = TRUE
)
)
cat(sprintf("Waktu pelatihan (detik): %.2f\n", waktu_latih_rf["elapsed"]))
## Waktu pelatihan (detik): 23.66
##
## Call:
## randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih_s, ntree = 300, mtry = mtry_terbaik, importance = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.03%
## Confusion matrix:
## kelas_0 kelas_1 kelas_2 kelas_3 class.error
## kelas_0 6582 1 0 0 0.0001519064
## kelas_1 0 6581 2 0 0.0003038129
## kelas_2 0 0 6581 2 0.0003038129
## kelas_3 0 0 3 6580 0.0004557193
# Variable importance (Mean Decrease Accuracy) of the final random forest,
# sorted from most to least important.
imp_mat <- importance(model_rf)
df_imp <- data.frame(
  Fitur = rownames(imp_mat),
  Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
) %>%
  arrange(desc(Pentingnya))

# Horizontal bar chart of the importance scores.
ggplot(
  df_imp,
  aes(x = reorder(Fitur, Pentingnya), y = Pentingnya, fill = Pentingnya)
) +
  geom_col(color = "black", width = 0.7) +
  coord_flip() +
  scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
  labs(
    title = "Pentingnya Fitur — Random Forest",
    subtitle = "Berdasarkan Mean Decrease Accuracy",
    x = "Variabel Sensor", y = "Mean Decrease Accuracy"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

# The prediction step used to be fused onto the end of the theme() call
# above, which made the script unparsable; it is now its own statement.
waktu_pred_rf <- system.time(
  pred_rf <- predict(model_rf, X_uji, type = "class")
)

# Class probabilities for AUC-ROC, with columns labelled by LEVELS.
prob_rf <- predict(model_rf, X_uji, type = "prob")
prob_rf <- as.data.frame(prob_rf)
colnames(prob_rf) <- LEVELS
hasil_rf <- evaluasi(y_uji, pred_rf, prob_rf, "Random Forest")
##
## ========== Random Forest ==========
## Balanced Accuracy : 0.9955 (99.55%)
## Presisi (macro) : 0.9927
## Recall (macro) : 0.9921
## F1-Score (macro) : 0.9924
## AUC-ROC (weighted): 1.0000
## AUC per kelas : 1 1 0.9999 1
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 1
## kelas_1 0 90 2 0
## kelas_2 0 1 147 0
## kelas_3 0 0 0 137
# Model size: the number of trees stored in the forest (ntree only).
ukuran_rf <- model_rf[["ntree"]]
cat(sprintf("\nJumlah pohon (ntree) : %d\n", ukuran_rf))
##
## Jumlah pohon (ntree) : 300
## Waktu prediksi (detik) : 0.0500
# Summary table: one row per model with its tuned hyperparameter and the
# test-set metrics rounded to four decimals.
tabel_hasil <- local({
  semua_hasil <- list(hasil_dt, hasil_rl, hasil_rf)

  # Pull one metric from every result list, in model order, and round it.
  ambil <- function(nama) {
    round(unlist(lapply(semua_hasil, `[[`, nama)), 4)
  }

  data.frame(
    Model = c("Decision Tree", "Regresi Logistik", "Random Forest"),
    Hyperparameter = c(
      paste0("cp = ", formatC(cp_terbaik, format = "f", digits = 4)),
      paste0("decay = ", formatC(decay_terbaik, format = "f", digits = 4)),
      paste0("mtry = ", mtry_terbaik)
    ),
    Balanced_Acc = ambil("bal_acc"),
    Presisi = ambil("presisi"),
    Recall = ambil("recall"),
    F1_Score = ambil("f1"),
    AUC_ROC = ambil("auc")
  )
})
print(tabel_hasil)
## Model Hyperparameter Balanced_Acc Presisi Recall F1_Score AUC_ROC
## 1 Decision Tree cp = 0.0001 0.9829 0.9685 0.9687 0.9686 0.9974
## 2 Regresi Logistik decay = 0.0001 0.9881 0.9870 0.9792 0.9829 0.9969
## 3 Random Forest mtry = 2 0.9955 0.9927 0.9921 0.9924 1.0000
# Complexity table: training/prediction time, model size and qualitative
# ratings per model.
tabel_kompleksitas <- local({
  # The decision-tree training time is measured here by fitting a fresh rpart
  # directly at cp = cp_terbaik with the same pre-pruning limits; this is a
  # separate fit from the grow-then-prune path that produced model_dt.
  waktu_latih_dt <- system.time(
    rpart(
      Room_Occupancy_Count ~ .,
      data = data_latih_s,
      method = "class",
      control = rpart.control(
        cp = cp_terbaik,
        maxdepth = 5,
        minsplit = 20,
        minbucket = 7
      )
    )
  )

  data.frame(
    Model = c("Decision Tree", "Regresi Logistik", "Random Forest"),
    Waktu_Latih_det = round(
      c(
        as.numeric(waktu_latih_dt["elapsed"]),
        waktu_latih_rl["elapsed"],
        waktu_latih_rf["elapsed"]
      ),
      3
    ),
    Waktu_Pred_det = round(
      c(
        waktu_pred_dt["elapsed"],
        waktu_pred_rl["elapsed"],
        waktu_pred_rf["elapsed"]
      ),
      4
    ),
    Ukuran_Model = c(
      paste0(ukuran_dt, " node"),
      paste0(ukuran_rl, " parameter"),
      paste0(ukuran_rf, " pohon")
    ),
    Interpretabilitas = c("Tinggi", "Sedang", "Rendah"),
    Risiko_Overfit = c("Sedang", "Rendah", "Rendah")
  )
})
print(tabel_kompleksitas)
## Model Waktu_Latih_det Waktu_Pred_det Ukuran_Model
## 1 Decision Tree 1.21 0.00 43 node
## 2 Regresi Logistik 7.64 0.01 51 parameter
## 3 Random Forest 23.66 0.05 300 pohon
## Interpretabilitas Risiko_Overfit
## 1 Tinggi Sedang
## 2 Sedang Rendah
## 3 Rendah Rendah
# NOTE: several statements in this section used to be fused onto the end of
# the preceding theme() call, which made the script unparsable. Each one is
# now a statement of its own.

# ── Metric comparison plot ──────────────────────────────────────────────────
# Long format: one row per (model, metric) pair for the faceted plot.
df_plot <- melt(
  tabel_hasil[, c("Model", "Balanced_Acc", "Presisi", "Recall", "F1_Score", "AUC_ROC")],
  id.vars = "Model", variable.name = "Metrik", value.name = "Nilai"
)
# Tidier facet labels
df_plot$Metrik <- recode(df_plot$Metrik,
  "Balanced_Acc" = "Balanced Acc",
  "AUC_ROC" = "AUC-ROC"
)
ggplot(df_plot, aes(x = Model, y = Nilai, fill = Model)) +
  geom_col(color = "black", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f", Nilai)),
    vjust = -0.4, size = 3.5, fontface = "bold"
  ) +
  facet_wrap(~Metrik, ncol = 5) +
  scale_fill_manual(values = c(
    "Decision Tree" = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest" = "#70AD47"
  )) +
  scale_y_continuous(
    limits = c(0, 1.15),
    labels = percent_format(accuracy = 1)
  ) +
  labs(
    title = "Perbandingan Performa Ketiga Model",
    subtitle = paste0("SMOTE + Hyperparameter Tuning (5-Fold CV, metric: AUC-ROC) | Seed = ", SEED),
    x = NULL, y = "Nilai Metrik", fill = "Model"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

# ── Training / prediction time plot ─────────────────────────────────────────
# Long format: the three training times followed by the three prediction times.
df_waktu <- data.frame(
  Model = rep(c("Decision Tree", "Regresi Logistik", "Random Forest"), 2),
  Jenis = rep(c("Pelatihan", "Prediksi"), each = 3),
  Waktu = c(
    tabel_kompleksitas$Waktu_Latih_det,
    tabel_kompleksitas$Waktu_Pred_det
  )
)
ggplot(df_waktu, aes(x = Model, y = Waktu, fill = Model)) +
  geom_col(color = "black", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f s", Waktu)),
    vjust = -0.4, size = 3.8, fontface = "bold"
  ) +
  facet_wrap(~Jenis, scales = "free_y") +
  scale_fill_manual(values = c(
    "Decision Tree" = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest" = "#70AD47"
  )) +
  labs(
    title = "Perbandingan Waktu Pelatihan & Prediksi",
    subtitle = "Semakin rendah semakin efisien",
    x = NULL, y = "Waktu (detik)", fill = "Model"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

# ── Confusion matrix of the best model (highest AUC-ROC) ────────────────────
terbaik_idx <- which.max(tabel_hasil$AUC_ROC)
# as.character() keeps switch() dispatching on the model name even if the
# Model column is a factor.
nama_terbaik <- as.character(tabel_hasil$Model[terbaik_idx])
cm_terbaik <- switch(nama_terbaik,
  "Decision Tree" = hasil_dt$cm,
  "Regresi Logistik" = hasil_rl$cm,
  "Random Forest" = hasil_rf$cm
)
df_cm <- as.data.frame(cm_terbaik$table)
names(df_cm) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(df_cm, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 1.2) +
  geom_text(aes(label = Frekuensi),
    color = "black",
    size = 6, fontface = "bold"
  ) +
  scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
  labs(
    title = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
    subtitle = "Baris = Prediksi | Kolom = Nilai Aktual | Dipilih berdasarkan AUC-ROC",
    x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Jumlah"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold"))

# ── Final summary ───────────────────────────────────────────────────────────
# Row of the summary table with the highest AUC-ROC.
terbaik <- tabel_hasil %>%
  arrange(desc(AUC_ROC)) %>%
  slice(1)
cat("============================================================\n")
## ============================================================
## SEED : 42
## Penanganan Imbalance : SMOTE (over_ratio=1, K=5)
## Data untuk semua model : Sama (hasil SMOTE, tanpa standarisasi)
## Evaluasi Tuning : 5-Fold CV — metric: AUC-ROC
## Metrik Evaluasi Utama : Balanced Accuracy + AUC-ROC (weighted)
## ------------------------------------------------------------
## MODEL TERBAIK : Random Forest
## Hyperparameter Terbaik : mtry = 2
# Print the balanced accuracy of the best model as a fraction and a percentage.
with(
  terbaik,
  cat(sprintf(
    " Balanced Accuracy : %.4f (%.2f%%)\n",
    Balanced_Acc, Balanced_Acc * 100
  ))
)
## Balanced Accuracy : 0.9955 (99.55%)
## AUC-ROC (weighted) : 1.0000
## F1-Score (macro) : 0.9924
## ============================================================