1. Muat Library

library(rpart)        # Decision Tree
library(rpart.plot)   # Visualisasi Decision Tree
library(nnet)         # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(themis)       # SMOTE (via recipes)
library(recipes)      # Pipeline pra-pemrosesan
library(caret)        # CV, tuning, confusion matrix
library(pROC)         # AUC-ROC
library(ggplot2)      # Visualisasi
library(dplyr)        # Manipulasi data
library(reshape2)     # Melt untuk plot
library(scales)       # Format label

# install.packages(c("rpart","rpart.plot","nnet","randomForest",
#                    "themis","recipes","caret","pROC","MLmetrics",
#                    "ggplot2","dplyr","reshape2","scales"))
# Catatan: MLmetrics dibutuhkan oleh caret::multiClassSummary saat classProbs = TRUE.

2. Penentuan Seed & Load Data

Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, SMOTE, cross-validation, model berbasis pohon) menghasilkan hasil yang sama setiap kali dijalankan ulang.

SEED <- 42
set.seed(SEED)
cat("Seed aktif:", SEED, "\n")

## Seed aktif: 42

df <- read.csv("Occupancy_Estimation.csv",
               stringsAsFactors = FALSE)
cat("Dataset dimuat:", nrow(df), "baris x", ncol(df), "kolom\n")

## Dataset dimuat: 10129 baris x 19 kolom

cat("Missing values :", sum(is.na(df)), "\n")

## Missing values : 0

3. Pra-pemrosesan

3.1 Hapus Kolom Waktu & Konversi Target

# Class labels carry a "kelas_" prefix so that every factor level is a valid
# R variable name (caret needs factor levels that are valid names).
LEVELS <- paste0("kelas_", 0:3)

# Drop the timestamp columns and recode the occupancy count as a factor whose
# level order is fixed by LEVELS.
df_bersih <- df %>%
  select(-Date, -Time) %>%
  mutate(
    Room_Occupancy_Count = factor(
      paste0("kelas_", Room_Occupancy_Count),
      levels = LEVELS
    )
  )

cat("Fitur prediktor :", ncol(df_bersih) - 1, "\n")

## Fitur prediktor : 16

cat("Kelas target    :", levels(df_bersih$Room_Occupancy_Count), "\n")

## Kelas target    : kelas_0 kelas_1 kelas_2 kelas_3

3.2 Split Data — 80% Latih / 20% Uji

# Reset the seed so the 80/20 partition is reproducible on its own.
set.seed(SEED)
idx_latih <- createDataPartition(
  y    = df_bersih[["Room_Occupancy_Count"]],
  p    = 0.8,
  list = FALSE
)

# Rows selected by the partition form the training set; the rest are held out.
data_latih <- df_bersih[idx_latih, ]
data_uji   <- df_bersih[-idx_latih, ]

# Held-out labels and predictors, shared by ALL models (no standardisation).
y_uji <- data_uji[["Room_Occupancy_Count"]]
X_uji <- select(data_uji, -Room_Occupancy_Count)

cat("Data latih :", nrow(data_latih), "observasi\n")

## Data latih : 8106 observasi

cat("Data uji   :", nrow(data_uji),   "observasi\n")

## Data uji   : 2023 observasi

cat("\nDistribusi kelas — Data Latih:\n")

## 
## Distribusi kelas — Data Latih:

print(table(data_latih$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583     368     599     556

cat("\nDistribusi kelas — Data Uji:\n")

## 
## Distribusi kelas — Data Uji:

print(table(data_uji$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    1645      91     149     138

3.3 Penanganan Ketidakseimbangan Kelas (SMOTE)

Mengapa SMOTE?
Kelas 0 (kosong) mencakup ~81% data. Tanpa penanganan, model cenderung mengabaikan kelas minoritas.
SMOTE membangkitkan sampel sintetis untuk kelas 1, 2, dan 3 melalui interpolasi di ruang fitur — hanya diterapkan pada data latih agar data uji tetap mencerminkan kondisi nyata.

Catatan: Seluruh model menggunakan data yang sama (hasil SMOTE, tanpa standarisasi) agar perbandingan antar model berlaku adil.

# SMOTE is random, so pin the seed right before building the recipe.
set.seed(SEED)

# Recipe with a single SMOTE step on the target: over_ratio = 1 and
# 5 nearest neighbours. It is defined on the training data only.
# NOTE(review): the oversampled set is later passed as-is to caret::train(),
# so cross-validation folds will contain synthetic rows - CV scores are
# probably optimistic; confirm whether that is acceptable.
rec_smote <- step_smote(
  recipe(Room_Occupancy_Count ~ ., data = data_latih),
  Room_Occupancy_Count,
  over_ratio = 1,
  neighbors  = 5
)

# Estimate the recipe and extract the processed (oversampled) training set.
data_latih_s <- bake(prep(rec_smote), new_data = NULL)

# Re-apply LEVELS so the factor level order stays consistent downstream.
data_latih_s[["Room_Occupancy_Count"]] <- factor(
  data_latih_s[["Room_Occupancy_Count"]],
  levels = LEVELS
)

cat("Distribusi sebelum SMOTE:\n")

## Distribusi sebelum SMOTE:

print(table(data_latih$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583     368     599     556

cat("\nDistribusi setelah SMOTE:\n")

## 
## Distribusi setelah SMOTE:

print(table(data_latih_s$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583    6583    6583    6583

# Stack the class labels before and after SMOTE into one long data frame so
# both distributions can be drawn as facets of a single bar chart.
df_smote_vis <- rbind(
  data.frame(Kondisi = "Sebelum SMOTE",
             Kelas   = as.character(data_latih$Room_Occupancy_Count)),
  data.frame(Kondisi = "Setelah SMOTE",
             Kelas   = as.character(data_latih_s$Room_Occupancy_Count))
)
# Fix the facet order: the "before" panel first, the "after" panel second.
df_smote_vis$Kondisi <- factor(df_smote_vis$Kondisi,
                                levels = c("Sebelum SMOTE","Setelah SMOTE"))

# One bar per class with its count printed above it, one facet per condition,
# a fixed colour per class, and the (redundant) fill legend hidden.
ggplot(df_smote_vis, aes(x = Kelas, fill = Kelas)) +
  geom_bar(color = "black", width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.4, size = 3.8, fontface = "bold") +
  facet_wrap(~Kondisi) +
  scale_fill_manual(values = c("kelas_0"="#4472C4","kelas_1"="#ED7D31",
                                "kelas_2"="#70AD47","kelas_3"="#E74C3C")) +
  labs(title    = "Distribusi Kelas Sebelum dan Setelah SMOTE",
       subtitle = "SMOTE hanya diterapkan pada data latih",
       x = "Kelas Penghuni", y = "Frekuensi", fill = "Kelas") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

4. Fungsi Evaluasi

# ── Weighted-average one-vs-rest AUC-ROC for a multiclass problem ──────────
#
# For each class in LEVELS the labels are binarised (1 = this class, 0 = any
# other class) and scored against that class's probability column. The
# per-class AUCs are then averaged, weighted by each class's share of the
# observations.
#
# Args:
#   y_aktual : vector/factor of true labels, compared against LEVELS.
#   prob_mat : matrix/data.frame with one column per class, named by LEVELS,
#              and one row per observation.
#
# Returns: list with
#   auc_weighted  : prevalence-weighted mean of the per-class AUCs
#                   (NA when no class could be scored),
#   auc_per_kelas : named numeric vector of per-class AUCs (NA for a class
#                   that has no positives or no negatives).
hitung_auc <- function(y_aktual, prob_mat) {
  # Fail early with a clear message instead of a subscript error inside roc().
  kolom_hilang <- setdiff(LEVELS, colnames(prob_mat))
  if (length(kolom_hilang) > 0) {
    stop("prob_mat is missing column(s): ",
         paste(kolom_hilang, collapse = ", "), call. = FALSE)
  }
  if (length(y_aktual) != nrow(prob_mat)) {
    stop("y_aktual and prob_mat must have the same number of observations",
         call. = FALSE)
  }

  n_total <- length(y_aktual)
  aucs    <- rep(NA_real_, length(LEVELS))
  bobot   <- numeric(length(LEVELS))

  for (i in seq_along(LEVELS)) {
    kls        <- LEVELS[i]
    bin_aktual <- as.integer(y_aktual == kls)
    n_positif  <- sum(bin_aktual)
    bobot[i]   <- n_positif / n_total

    # AUC is undefined when a class has no positives or no negatives.
    if (n_positif == 0 || n_positif == n_total) {
      next
    }

    # Pin the control/case levels and the direction. With the default
    # direction = "auto", pROC picks whichever direction gives the larger
    # AUC, which silently turns a worse-than-chance score into one > 0.5.
    roc_obj <- roc(bin_aktual, prob_mat[, kls],
                   levels = c(0, 1), direction = "<", quiet = TRUE)
    aucs[i] <- as.numeric(auc(roc_obj))
  }

  # Weighted average over the classes that could be scored.
  valid <- !is.na(aucs)
  auc_w <- if (any(valid)) {
    sum(aucs[valid] * bobot[valid]) / sum(bobot[valid])
  } else {
    NA_real_
  }

  list(auc_weighted = auc_w, auc_per_kelas = setNames(aucs, LEVELS))
}

# ── Main evaluation routine ─────────────────────────────────────────────────
#
# Scores one model's predictions against the true labels, prints a short
# report plus the confusion matrix, and returns the metrics.
#
# Args:
#   y_aktual   : true class labels (re-coded to a factor with levels LEVELS).
#   y_pred     : predicted class labels (re-coded the same way).
#   prob_mat   : optional class-probability matrix/data.frame handed to
#                hitung_auc(); when NULL the AUC stays NA and is not printed.
#   nama_model : label shown in the report header.
#
# Returns: list(cm, bal_acc, presisi, recall, f1, auc).
evaluasi <- function(y_aktual, y_pred, prob_mat = NULL, nama_model) {

  # Put both label vectors on the same level set before tabulating.
  sebagai_kelas <- function(v) factor(as.character(v), levels = LEVELS)
  y_aktual <- sebagai_kelas(y_aktual)
  y_pred   <- sebagai_kelas(y_pred)

  cm <- confusionMatrix(data = y_pred, reference = y_aktual)

  # Macro average: unweighted mean of a per-class statistic, skipping NAs.
  per_kelas <- cm$byClass
  makro     <- function(kolom) mean(per_kelas[, kolom], na.rm = TRUE)

  bal_acc <- makro("Balanced Accuracy")
  pre     <- makro("Precision")
  rec     <- makro("Recall")
  f1      <- makro("F1")

  # Weighted AUC-ROC is only available when probabilities were supplied.
  if (is.null(prob_mat)) {
    auc_w   <- NA
    auc_per <- NULL
  } else {
    auc_res <- hitung_auc(y_aktual, prob_mat)
    auc_w   <- auc_res$auc_weighted
    auc_per <- auc_res$auc_per_kelas
  }

  cat(
    sprintf("\n========== %s ==========\n", nama_model),
    sprintf("  Balanced Accuracy : %.4f  (%.2f%%)\n", bal_acc, bal_acc * 100),
    sprintf("  Presisi (macro)   : %.4f\n", pre),
    sprintf("  Recall  (macro)   : %.4f\n", rec),
    sprintf("  F1-Score (macro)  : %.4f\n", f1),
    sep = ""
  )
  if (!is.na(auc_w)) {
    cat(sprintf("  AUC-ROC (weighted): %.4f\n", auc_w))
    cat("  AUC per kelas     :", round(auc_per, 4), "\n")
  }
  cat("\nConfusion Matrix:\n")
  print(cm$table)

  list(
    cm      = cm,
    bal_acc = bal_acc,
    presisi = pre,
    recall  = rec,
    f1      = f1,
    auc     = auc_w
  )
}

5. Setup Cross-Validation (untuk Hyperparameter Tuning)

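5-Fold Stratified CV digunakan untuk memilih hyperparameter terbaik. Metrik tuning adalah AUC dari multiClassSummary (rata-rata makro AUC-ROC one-vs-all) agar sejalan dengan metrik evaluasi akhir, yang juga berbasis AUC-ROC namun dirata-ratakan dengan bobot proporsi kelas. Catatan: karena SMOTE diterapkan sebelum CV, fold validasi ikut memuat sampel sintetis sehingga skor CV cenderung optimistis.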

# Resampling control shared by every tuning run: 5-fold cross-validation.
# Class probabilities must be switched on for the AUC to be computable, and
# multiClassSummary supplies the multi-class AUC / Accuracy / Kappa columns.
# Only the hold-out predictions of the winning parameter set are kept.
ctrl_cv_lengkap <- trainControl(
  method          = "cv",
  number          = 5,
  savePredictions = "final",
  classProbs      = TRUE,
  summaryFunction = multiClassSummary
)

6. Model 1 — Decision Tree

6.1 Hyperparameter Tuning

cp (complexity parameter): mengontrol ukuran pohon.
Nilai kecil → pohon lebih dalam (risiko overfit),
nilai besar → pohon lebih sederhana (risiko underfit).
Tuning didasarkan pada AUC-ROC pada 5-Fold CV.

# Reset the seed so the CV folds for this tuning run are reproducible.
set.seed(SEED)

# Candidate complexity-parameter values, from nearly unpruned to heavily pruned.
grid_dt <- expand.grid(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05))

# Tune rpart's cp on the SMOTE-balanced training set; the best cp is the one
# with the highest cross-validated "AUC" reported by the summary function.
tuning_dt <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih_s,
  method    = "rpart",
  trControl = ctrl_cv_lengkap, # must be the control that sets classProbs = TRUE
  tuneGrid  = grid_dt,
  metric    = "AUC"  # column name produced by multiClassSummary
)

cat("Hasil Tuning Decision Tree:\n")

## Hasil Tuning Decision Tree:

print(tuning_dt$results[, c("cp", "AUC", "Accuracy", "Kappa")])

##      cp       AUC  Accuracy     Kappa
## 1 1e-04 0.9988971 0.9955566 0.9940755
## 2 5e-04 0.9983245 0.9924425 0.9899234
## 3 1e-03 0.9973368 0.9887966 0.9850622
## 4 5e-03 0.9947655 0.9758086 0.9677448
## 5 1e-02 0.9901377 0.9541239 0.9388319
## 6 5e-02 0.9532082 0.8866776 0.8489036

cat("\nCP terbaik:", tuning_dt$bestTune$cp, "\n")

## 
## CP terbaik: 1e-04

6.2 Latih Model Final (dengan Pruning)

Pruning dilakukan dengan kombinasi:

- Pre-pruning: maxdepth = 5, minsplit = 20, minbucket = 7 untuk mencegah pohon tumbuh terlalu dalam.
- Post-pruning: prune() menggunakan cp terbaik dari CV untuk memangkas cabang yang tidak signifikan.

# Reset the seed before fitting the final tree and pick up the cp value
# selected by the caret tuning run.
# NOTE(review): that tuning call passed no rpart control (no maxdepth limit),
# whereas the tree below is capped at maxdepth = 5 - confirm the tuned cp is
# meant to transfer to this constrained tree.
set.seed(SEED)
cp_terbaik <- tuning_dt$bestTune$cp

# Fit with pre-pruning limits and a very small cp so the tree grows (almost)
# fully within those limits before being cut back.
model_dt_full <- rpart(
  Room_Occupancy_Count ~ .,
  data    = data_latih_s,
  method  = "class",
  control = rpart.control(
    cp        = 0.00001,   # let it grow first, post-prune afterwards
    maxdepth  = 5,         # pre-pruning: depth limit
    minsplit  = 20,        # pre-pruning: min observations to attempt a split
    minbucket = 7          # pre-pruning: min observations in a leaf
  )
)

# Post-pruning: cut the tree back using the best cp from CV.
model_dt <- prune(model_dt_full, cp = cp_terbaik)

cat("Decision Tree dilatih & dipangkas dengan cp =", cp_terbaik, "\n")

## Decision Tree dilatih & dipangkas dengan cp = 1e-04

cat("Jumlah node setelah pruning:", nrow(model_dt$frame), "\n")

## Jumlah node setelah pruning: 43

printcp(model_dt)

## 
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih_s, 
##     method = "class", control = rpart.control(cp = 1e-05, maxdepth = 5, 
##         minsplit = 20, minbucket = 7))
## 
## Variables actually used in tree construction:
##  [1] S1_Light     S1_Sound     S1_Temp      S2_Light     S2_Temp     
##  [6] S3_Light     S3_Sound     S3_Temp      S5_CO2       S5_CO2_Slope
## [11] S7_PIR      
## 
## Root node error: 19749/26332 = 0.75
## 
## n= 26332 
## 
##            CP nsplit rel error   xerror      xstd
## 1  0.31297787      0  1.000000 1.008861 0.0035258
## 2  0.24765811      1  0.687022 0.688946 0.0041060
## 3  0.12658869      2  0.439364 0.439465 0.0038624
## 4  0.08309281      3  0.312775 0.313889 0.0034860
## 5  0.07904198      4  0.229683 0.230796 0.0031086
## 6  0.03691326      5  0.150641 0.150843 0.0026027
## 7  0.02724189      6  0.113727 0.113930 0.0022969
## 8  0.01316522      7  0.086485 0.086688 0.0020259
## 9  0.01235506      8  0.073320 0.076966 0.0019163
## 10 0.00886121      9  0.060965 0.061168 0.0017191
## 11 0.00825358     10  0.052104 0.054079 0.0016209
## 12 0.00693706     11  0.043850 0.044610 0.0014776
## 13 0.00450656     12  0.036913 0.038280 0.0013721
## 14 0.00151906     13  0.032407 0.033774 0.0012911
## 15 0.00136716     14  0.030888 0.032154 0.0012605
## 16 0.00111398     15  0.029520 0.030533 0.0012291
## 17 0.00070890     16  0.028407 0.029622 0.0012110
## 18 0.00050635     17  0.027698 0.028862 0.0011957
## 19 0.00025318     18  0.027191 0.028660 0.0011916
## 20 0.00015191     19  0.026938 0.028660 0.0011916
## 21 0.00001000     21  0.026634 0.028204 0.0011823

6.3 Visualisasi Pohon Keputusan (Setelah Pruning)

library(rpart.plot)

# Draw the pruned tree that is actually evaluated and reported (`model_dt`).
# The previous version plotted `tuning_dt$finalModel`, i.e. caret's own fit
# from the tuning run, which is a different tree from the one this section
# and the plot title describe.
rpart.plot(
  model_dt,
  type          = 2,    # split labels below the node boxes
  extra         = 104,  # per-class probabilities plus % of observations
  fallen.leaves = TRUE, # align all leaves at the bottom of the plot
  main          = "Decision Tree (Pruned) - Estimasi Penghuni Ruangan",
  box.palette   = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000"),
  shadow.col    = "gray",
  cex           = 0.75,
  tweak         = 1.1
)

6.4 Evaluasi Decision Tree

# Time only the hard class prediction on the held-out predictors.
waktu_pred_dt <- system.time({
  pred_dt <- predict(model_dt, newdata = X_uji, type = "class")
})

# Class probabilities for the AUC-ROC, as a data frame with columns named
# after LEVELS.
prob_dt <- setNames(
  as.data.frame(predict(model_dt, newdata = X_uji, type = "prob")),
  LEVELS
)

# Score on the test set; prints the report and keeps the metrics.
hasil_dt <- evaluasi(
  y_aktual   = y_uji,
  y_pred     = pred_dt,
  prob_mat   = prob_dt,
  nama_model = "Decision Tree"
)

## 
## ========== Decision Tree ==========
##   Balanced Accuracy : 0.9829  (98.29%)
##   Presisi (macro)   : 0.9685
##   Recall  (macro)   : 0.9687
##   F1-Score (macro)  : 0.9686
##   AUC-ROC (weighted): 0.9974
##   AUC per kelas     : 0.9985 0.993 0.9908 0.9937 
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1643       0       0       1
##    kelas_1       0      88       2       0
##    kelas_2       1       2     143       6
##    kelas_3       1       1       4     131

# Ukuran model: jumlah node
ukuran_dt <- nrow(model_dt$frame)
cat(sprintf("\nJumlah node Decision Tree : %d\n", ukuran_dt))

## 
## Jumlah node Decision Tree : 43

cat(sprintf("Waktu prediksi (detik)    : %.4f\n", waktu_pred_dt["elapsed"]))

## Waktu prediksi (detik)    : 0.0000

7. Model 2 — Regresi Logistik Multinomial

Catatan: Regresi Logistik menggunakan data yang sama dengan model lain (hasil SMOTE, tanpa standarisasi) agar perbandingan antar model berlaku adil.

7.1 Hyperparameter Tuning

decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar. Tuning didasarkan pada AUC-ROC.

# Reset the seed so the CV folds for this tuning run are reproducible.
set.seed(SEED)

# Candidate weight-decay values for the multinomial model.
grid_rl   <- expand.grid(decay = c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5))

# Tune decay with the shared 5-fold CV control; maxit, MaxNWts and trace are
# not train() arguments and are forwarded to the underlying model fit.
tuning_rl <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih_s,     # same data as the other models (fair comparison)
  method    = "multinom",
  trControl = ctrl_cv_lengkap,
  tuneGrid  = grid_rl,
  maxit     = 200,
  MaxNWts   = 5000,
  trace     = FALSE,
  metric    = "AUC"             # select the best decay by AUC-ROC
)

cat("Hasil Tuning Regresi Logistik:\n")

## Hasil Tuning Regresi Logistik:

print(tuning_rl$results[, intersect(c("decay","AUC","ROC","Accuracy","Kappa"), names(tuning_rl$results))])

##   decay       AUC  Accuracy     Kappa
## 1 1e-04 0.9996347 0.9898602 0.9864803
## 2 1e-03 0.9996208 0.9897843 0.9863790
## 3 1e-02 0.9995858 0.9897842 0.9863789
## 4 5e-02 0.9995358 0.9888728 0.9851637
## 5 1e-01 0.9995014 0.9884170 0.9845560
## 6 5e-01 0.9993671 0.9872018 0.9829357

cat("\nDecay terbaik:", tuning_rl$bestTune$decay, "\n")

## 
## Decay terbaik: 1e-04

plot(tuning_rl, main = "Tuning Regresi Logistik — AUC-ROC vs Decay")

7.2 Latih Model Final

# Reset the seed and pick up the decay value selected by the tuning run.
set.seed(SEED)
decay_terbaik <- tuning_rl$bestTune$decay

# Fit the final multinomial model on the full SMOTE-balanced training set and
# record how long the fit takes; messages raised during fitting are suppressed.
waktu_latih_rl <- system.time(
  suppressMessages(
    model_rl <- multinom(
      Room_Occupancy_Count ~ .,
      data    = data_latih_s,   # same data as the other models (no standardisation)
      MaxNWts = 5000,
      maxit   = 200,
      trace   = FALSE,
      decay   = decay_terbaik
    )
  )
)

cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")

## Regresi Logistik dilatih dengan decay = 1e-04

cat(sprintf("Waktu pelatihan (detik): %.2f\n", waktu_latih_rl["elapsed"]))

## Waktu pelatihan (detik): 7.64

7.3 Evaluasi Regresi Logistik

# Time only the hard class prediction on the held-out predictors.
waktu_pred_rl <- system.time({
  pred_rl <- predict(model_rl, newdata = X_uji, type = "class")
})

# Class probabilities for the AUC-ROC, as a data frame with columns named
# after LEVELS.
prob_rl <- setNames(
  as.data.frame(predict(model_rl, newdata = X_uji, type = "probs")),
  LEVELS
)

# Score on the test set; prints the report and keeps the metrics.
hasil_rl <- evaluasi(
  y_aktual   = y_uji,
  y_pred     = pred_rl,
  prob_mat   = prob_rl,
  nama_model = "Regresi Logistik"
)

## 
## ========== Regresi Logistik ==========
##   Balanced Accuracy : 0.9881  (98.81%)
##   Presisi (macro)   : 0.9870
##   Recall  (macro)   : 0.9792
##   F1-Score (macro)  : 0.9829
##   AUC-ROC (weighted): 0.9969
##   AUC per kelas     : 0.9974 0.9988 0.9938 0.9935 
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       3
##    kelas_1       0      88       1       0
##    kelas_2       0       3     148       3
##    kelas_3       0       0       0     132

# Ukuran model: jumlah parameter (koefisien)
ukuran_rl <- length(coef(model_rl))
cat(sprintf("\nJumlah parameter model    : %d\n", ukuran_rl))

## 
## Jumlah parameter model    : 51

cat(sprintf("Waktu prediksi (detik)    : %.4f\n", waktu_pred_rl["elapsed"]))

## Waktu prediksi (detik)    : 0.0100

8. Model 3 — Random Forest

8.1 Hyperparameter Tuning

mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Tuning didasarkan pada AUC-ROC.

# Reset the seed so the CV folds for this tuning run are reproducible.
set.seed(SEED)

# Candidate numbers of predictors tried at each split.
grid_rf   <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))

# Tune mtry with the shared 5-fold CV control. ntree = 100 is forwarded to the
# forest fit for the tuning run only (the final model below uses more trees).
tuning_rf <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih_s,
  method    = "rf",
  trControl = ctrl_cv_lengkap,
  tuneGrid  = grid_rf,
  ntree     = 100,
  metric    = "AUC"             # select the best mtry by AUC-ROC
)

cat("Hasil Tuning Random Forest:\n")

## Hasil Tuning Random Forest:

print(tuning_rf$results[, intersect(c("mtry","AUC","ROC","Accuracy","Kappa"), names(tuning_rf$results))])

##   mtry       AUC  Accuracy     Kappa
## 1    2 0.9999996 0.9995822 0.9994430
## 2    4 0.9999995 0.9996202 0.9994937
## 3    6 0.9999991 0.9995443 0.9993924
## 4    8 0.9999992 0.9995823 0.9994430
## 5   10 0.9999741 0.9995823 0.9994430
## 6   12 0.9999736 0.9993164 0.9990885

cat("\nmtry terbaik:", tuning_rf$bestTune$mtry, "\n")

## 
## mtry terbaik: 2

plot(tuning_rf, main = "Tuning Random Forest — AUC-ROC vs mtry")

8.2 Latih Model Final

# Reset the seed and pick up the mtry value selected by the tuning run.
set.seed(SEED)
mtry_terbaik <- tuning_rf$bestTune$mtry

# ntree = 300 for the final model for more stable predictions (tuning used 100).
# importance = TRUE makes the permutation-based importance available later.
# The fit is wrapped in system.time() to record the training time.
waktu_latih_rf <- system.time(
  model_rf <- randomForest(
    Room_Occupancy_Count ~ .,
    data       = data_latih_s,
    ntree      = 300,
    mtry       = mtry_terbaik,
    importance = TRUE
  )
)

cat(sprintf("Waktu pelatihan (detik): %.2f\n", waktu_latih_rf["elapsed"]))

## Waktu pelatihan (detik): 23.66

print(model_rf)

## 
## Call:
##  randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih_s,      ntree = 300, mtry = mtry_terbaik, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0.03%
## Confusion matrix:
##         kelas_0 kelas_1 kelas_2 kelas_3  class.error
## kelas_0    6582       1       0       0 0.0001519064
## kelas_1       0    6581       2       0 0.0003038129
## kelas_2       0       0    6581       2 0.0003038129
## kelas_3       0       0       3    6580 0.0004557193

8.3 Feature Importance

# Extract the importance matrix of the fitted forest and keep the
# MeanDecreaseAccuracy column, one row per predictor, sorted from most to
# least important.
imp_mat <- importance(model_rf)
df_imp  <- data.frame(
  Fitur      = rownames(imp_mat),
  Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
) %>% arrange(desc(Pentingnya))

# Horizontal bar chart; reorder() sorts the bars by importance and the fill
# gradient encodes the same value, so the legend is hidden.
ggplot(df_imp, aes(x = reorder(Fitur, Pentingnya), y = Pentingnya,
                    fill = Pentingnya)) +
  geom_col(color = "black", width = 0.7) +
  coord_flip() +
  scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
  labs(title    = "Pentingnya Fitur — Random Forest",
       subtitle = "Berdasarkan Mean Decrease Accuracy",
       x = "Variabel Sensor", y = "Mean Decrease Accuracy") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

8.4 Evaluasi Random Forest

# Time only the hard class prediction on the held-out predictors.
waktu_pred_rf <- system.time({
  pred_rf <- predict(model_rf, newdata = X_uji, type = "class")
})

# Class probabilities for the AUC-ROC, as a data frame with columns named
# after LEVELS.
prob_rf <- setNames(
  as.data.frame(predict(model_rf, newdata = X_uji, type = "prob")),
  LEVELS
)

# Score on the test set; prints the report and keeps the metrics.
hasil_rf <- evaluasi(
  y_aktual   = y_uji,
  y_pred     = pred_rf,
  prob_mat   = prob_rf,
  nama_model = "Random Forest"
)

## 
## ========== Random Forest ==========
##   Balanced Accuracy : 0.9955  (99.55%)
##   Presisi (macro)   : 0.9927
##   Recall  (macro)   : 0.9921
##   F1-Score (macro)  : 0.9924
##   AUC-ROC (weighted): 1.0000
##   AUC per kelas     : 1 1 0.9999 1 
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       1
##    kelas_1       0      90       2       0
##    kelas_2       0       1     147       0
##    kelas_3       0       0       0     137

# Model size is reported as the number of trees only (model_rf$ntree);
# the per-tree node counts are not part of this figure.
ukuran_rf <- model_rf$ntree
cat(sprintf("\nJumlah pohon (ntree)      : %d\n", ukuran_rf))

## 
## Jumlah pohon (ntree)      : 300

cat(sprintf("Waktu prediksi (detik)    : %.4f\n", waktu_pred_rf["elapsed"]))

## Waktu prediksi (detik)    : 0.0500

9. Perbandingan Ketiga Model

9.1 Tabel Perbandingan — Performa

# Performance summary: one row per model with its tuned hyperparameter and the
# test-set metrics returned by evaluasi(), rounded to four decimals.
tabel_hasil <- local({
  hasil_semua <- list(hasil_dt, hasil_rl, hasil_rf)

  # Pull one metric out of every result list, in model order.
  kolom <- function(nama) {
    round(vapply(hasil_semua, function(h) h[[nama]], numeric(1)), 4)
  }

  data.frame(
    Model          = c("Decision Tree", "Regresi Logistik", "Random Forest"),
    Hyperparameter = c(
      paste0("cp = ",    formatC(cp_terbaik,    format = "f", digits = 4)),
      paste0("decay = ", formatC(decay_terbaik, format = "f", digits = 4)),
      paste0("mtry = ",  mtry_terbaik)
    ),
    Balanced_Acc   = kolom("bal_acc"),
    Presisi        = kolom("presisi"),
    Recall         = kolom("recall"),
    F1_Score       = kolom("f1"),
    AUC_ROC        = kolom("auc")
  )
})

print(tabel_hasil)

##              Model Hyperparameter Balanced_Acc Presisi Recall F1_Score AUC_ROC
## 1    Decision Tree    cp = 0.0001       0.9829  0.9685 0.9687   0.9686  0.9974
## 2 Regresi Logistik decay = 0.0001       0.9881  0.9870 0.9792   0.9829  0.9969
## 3    Random Forest       mtry = 2       0.9955  0.9927 0.9921   0.9924  1.0000

9.2 Tabel Perbandingan — Kompleksitas Model

# Decision Tree training time. The tree was not timed when it was fitted, so
# the fit is repeated here using the SAME procedure that produced `model_dt`:
# grow with cp = 0.00001 under the pre-pruning limits, then prune with
# cp_terbaik. (Previously a single rpart() call with cp = cp_terbaik and no
# prune step was timed, i.e. a different procedure from the reported model.)
# The tree built here is discarded; only the elapsed time is kept.
waktu_latih_dt <- system.time(
  prune(
    rpart(
      Room_Occupancy_Count ~ .,
      data    = data_latih_s,
      method  = "class",
      control = rpart.control(
        cp        = 0.00001,
        maxdepth  = 5,
        minsplit  = 20,
        minbucket = 7
      )
    ),
    cp = cp_terbaik
  )
)

# Complexity summary: elapsed fit / predict time in seconds, model size, and
# hand-assigned qualitative ratings. unname() drops the "elapsed" labels so
# the columns are plain numeric vectors.
tabel_kompleksitas <- data.frame(
  Model             = c("Decision Tree", "Regresi Logistik", "Random Forest"),
  Waktu_Latih_det   = round(unname(c(
    waktu_latih_dt["elapsed"],
    waktu_latih_rl["elapsed"],
    waktu_latih_rf["elapsed"]
  )), 3),
  Waktu_Pred_det    = round(unname(c(
    waktu_pred_dt["elapsed"],
    waktu_pred_rl["elapsed"],
    waktu_pred_rf["elapsed"]
  )), 4),
  Ukuran_Model      = c(
    paste0(ukuran_dt, " node"),
    paste0(ukuran_rl, " parameter"),
    paste0(ukuran_rf, " pohon")
  ),
  Interpretabilitas = c("Tinggi", "Sedang", "Rendah"),
  Risiko_Overfit    = c("Sedang", "Rendah", "Rendah")
)

print(tabel_kompleksitas)

##              Model Waktu_Latih_det Waktu_Pred_det Ukuran_Model
## 1    Decision Tree            1.21           0.00      43 node
## 2 Regresi Logistik            7.64           0.01 51 parameter
## 3    Random Forest           23.66           0.05    300 pohon
##   Interpretabilitas Risiko_Overfit
## 1            Tinggi         Sedang
## 2            Sedang         Rendah
## 3            Rendah         Rendah

9.3 Grafik Perbandingan Performa

# Reshape the performance table to long form: one row per (model, metric).
df_plot <- melt(
  tabel_hasil[, c("Model","Balanced_Acc","Presisi","Recall","F1_Score","AUC_ROC")],
  id.vars = "Model", variable.name = "Metrik", value.name = "Nilai"
)

# Tidier facet labels for the two metrics whose column names are abbreviated.
df_plot$Metrik <- recode(df_plot$Metrik,
  "Balanced_Acc" = "Balanced Acc",
  "AUC_ROC"      = "AUC-ROC"
)

# One facet per metric, one bar per model. Bar labels show the raw value with
# three decimals while the y axis is formatted as percentages; the upper limit
# of 1.15 leaves room for the labels above the bars. Model names are shown in
# the legend instead of on the x axis.
ggplot(df_plot, aes(x = Model, y = Nilai, fill = Model)) +
  geom_col(color = "black", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f", Nilai)),
            vjust = -0.4, size = 3.5, fontface = "bold") +
  facet_wrap(~Metrik, ncol = 5) +
  scale_fill_manual(values = c(
    "Decision Tree"    = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest"    = "#70AD47"
  )) +
  scale_y_continuous(limits = c(0, 1.15),
                     labels = percent_format(accuracy = 1)) +
  labs(title    = "Perbandingan Performa Ketiga Model",
       subtitle = paste0("SMOTE + Hyperparameter Tuning (5-Fold CV, metric: AUC-ROC) | Seed = ", SEED),
       x = NULL, y = "Nilai Metrik", fill = "Model") +
  theme_minimal(base_size = 11) +
  theme(axis.text.x    = element_blank(),
        axis.ticks.x   = element_blank(),
        strip.text     = element_text(face = "bold"),
        plot.title     = element_text(face = "bold"),
        legend.position = "bottom")

9.4 Grafik Perbandingan Kompleksitas

# Long-form timing table: the three models repeated once for training time
# and once for prediction time, in the same row order as tabel_kompleksitas.
df_waktu <- data.frame(
  Model    = rep(c("Decision Tree", "Regresi Logistik", "Random Forest"), 2),
  Jenis    = rep(c("Pelatihan", "Prediksi"), each = 3),
  Waktu    = c(
    tabel_kompleksitas$Waktu_Latih_det,
    tabel_kompleksitas$Waktu_Pred_det
  )
)

# One facet per timing type with independent y scales (scales = "free_y"),
# one bar per model, labelled in seconds. Model names are shown in the legend
# instead of on the x axis.
ggplot(df_waktu, aes(x = Model, y = Waktu, fill = Model)) +
  geom_col(color = "black", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f s", Waktu)),
            vjust = -0.4, size = 3.8, fontface = "bold") +
  facet_wrap(~Jenis, scales = "free_y") +
  scale_fill_manual(values = c(
    "Decision Tree"    = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest"    = "#70AD47"
  )) +
  labs(title    = "Perbandingan Waktu Pelatihan & Prediksi",
       subtitle = "Semakin rendah semakin efisien",
       x = NULL, y = "Waktu (detik)", fill = "Model") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x    = element_blank(),
        axis.ticks.x   = element_blank(),
        strip.text     = element_text(face = "bold"),
        plot.title     = element_text(face = "bold"),
        legend.position = "bottom")

9.5 Confusion Matrix Visual — Model Terbaik

# The best model is the row with the highest (rounded) AUC_ROC in the
# performance table; which.max() keeps the first row on ties.
terbaik_idx  <- which.max(tabel_hasil$AUC_ROC)
nama_terbaik <- tabel_hasil$Model[terbaik_idx]

# Look up that model's confusion-matrix object by name.
cm_terbaik <- list(
  "Decision Tree"    = hasil_dt$cm,
  "Regresi Logistik" = hasil_rl$cm,
  "Random Forest"    = hasil_rf$cm
)[[nama_terbaik]]

# Long form of the contingency table: predicted class, actual class, count.
df_cm <- setNames(
  as.data.frame(cm_terbaik$table),
  c("Prediksi", "Aktual", "Frekuensi")
)

# Heat map with the cell counts printed on top of the tiles.
ggplot(df_cm, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 1.2) +
  geom_text(
    aes(label = Frekuensi),
    color = "black", size = 6, fontface = "bold"
  ) +
  scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
  labs(
    title    = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
    subtitle = "Baris = Prediksi | Kolom = Nilai Aktual | Dipilih berdasarkan AUC-ROC",
    x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Jumlah"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold"))

10. Ringkasan Akhir

terbaik <- tabel_hasil %>% arrange(desc(AUC_ROC)) %>% slice(1)

cat("============================================================\n")

## ============================================================

cat(sprintf("  SEED                   : %d\n", SEED))

##   SEED                   : 42

cat(sprintf("  Penanganan Imbalance   : SMOTE (over_ratio=1, K=5)\n"))

##   Penanganan Imbalance   : SMOTE (over_ratio=1, K=5)

cat(sprintf("  Data untuk semua model : Sama (hasil SMOTE, tanpa standarisasi)\n"))

##   Data untuk semua model : Sama (hasil SMOTE, tanpa standarisasi)

cat(sprintf("  Evaluasi Tuning        : 5-Fold CV — metric: AUC-ROC\n"))

##   Evaluasi Tuning        : 5-Fold CV — metric: AUC-ROC

cat(sprintf("  Metrik Evaluasi Utama  : Balanced Accuracy + AUC-ROC (weighted)\n"))

##   Metrik Evaluasi Utama  : Balanced Accuracy + AUC-ROC (weighted)

cat("------------------------------------------------------------\n")

## ------------------------------------------------------------

cat(sprintf("  MODEL TERBAIK          : %s\n",   terbaik$Model))

##   MODEL TERBAIK          : Random Forest

cat(sprintf("  Hyperparameter Terbaik : %s\n",   terbaik$Hyperparameter))

##   Hyperparameter Terbaik : mtry = 2

cat(sprintf("  Balanced Accuracy      : %.4f (%.2f%%)\n",
            terbaik$Balanced_Acc, terbaik$Balanced_Acc * 100))

##   Balanced Accuracy      : 0.9955 (99.55%)

cat(sprintf("  AUC-ROC (weighted)     : %.4f\n", terbaik$AUC_ROC))

##   AUC-ROC (weighted)     : 1.0000

cat(sprintf("  F1-Score (macro)       : %.4f\n", terbaik$F1_Score))

##   F1-Score (macro)       : 0.9924

cat("============================================================\n")

## ============================================================

Occupancy Prediction — Pra-pemrosesan & Pemodelan

Decision Tree | Regresi Logistik | Random Forest

Kelompok G

2026-04-25

1. Muat Library

2. Penentuan Seed & Load Data

3. Pra-pemrosesan

3.1 Hapus Kolom Waktu & Konversi Target

3.2 Split Data — 80% Latih / 20% Uji

3.3 Penanganan Ketidakseimbangan Kelas (SMOTE)

4. Fungsi Evaluasi

5. Setup Cross-Validation (untuk Hyperparameter Tuning)

6. Model 1 — Decision Tree

6.1 Hyperparameter Tuning

6.2 Latih Model Final (dengan Pruning)

6.3 Visualisasi Pohon Keputusan (Setelah Pruning)

6.4 Evaluasi Decision Tree

7. Model 2 — Regresi Logistik Multinomial

7.1 Hyperparameter Tuning

7.2 Latih Model Final

7.3 Evaluasi Regresi Logistik

8. Model 3 — Random Forest

8.1 Hyperparameter Tuning

8.2 Latih Model Final

8.3 Feature Importance

8.4 Evaluasi Random Forest

9. Perbandingan Ketiga Model

9.1 Tabel Perbandingan — Performa

9.2 Tabel Perbandingan — Kompleksitas Model

9.3 Grafik Perbandingan Performa

9.4 Grafik Perbandingan Kompleksitas

9.5 Confusion Matrix Visual — Model Terbaik

10. Ringkasan Akhir