1. Muat Library

library(rpart)        # Decision Tree
library(rpart.plot)   # Visualisasi Decision Tree
library(nnet)         # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(smotefamily)  # SMOTE
library(caret)        # CV, tuning, confusion matrix
library(pROC)         # AUC-ROC
library(ggplot2)      # Visualisasi
library(dplyr)        # Manipulasi data
library(reshape2)     # Melt untuk plot
library(scales)       # Format label
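# Duplikat: smotefamily sudah dimuat di atas, dan SMOTE di bawah dijalankan lewat step_smote() dari themis; baris berikut boleh dihapus: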
library(smotefamily)
# Dibutuhkan untuk SMOTE berbasis recipe (recipe(), step_smote()):
library(themis)
library(recipes)

# install.packages(c("rpart","rpart.plot","nnet","randomForest",
#                    "smotefamily","caret","pROC",
#                    "ggplot2","dplyr","reshape2","scales",
#                    "themis","recipes"))

2. Penentuan Seed & Load Data

Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, SMOTE, cross-validation, model berbasis pohon) memberikan hasil yang sama setiap kali dijalankan ulang.

# Single source of truth for the RNG seed; it is re-applied before each
# stochastic step further down so results can be reproduced.
SEED <- 42
set.seed(seed = SEED)
cat("Seed aktif:", SEED, "\n")

## Seed aktif: 42

# Load the raw sensor table; text columns are kept as character.
df <- read.csv(
  file             = "Occupancy_Estimation.csv",
  stringsAsFactors = FALSE
)
ukuran_df <- dim(df)
cat("Dataset dimuat:", ukuran_df[1], "baris x", ukuran_df[2], "kolom\n")

## Dataset dimuat: 10129 baris x 19 kolom

3. Pra-pemrosesan

3.1 Hapus Kolom Waktu & Konversi Target

# Drop the two timestamp columns; everything else is kept.
df_bersih <- select(df, -Date, -Time)

# Class labels get a "kelas_" prefix so that every factor level is a
# syntactically valid R name (caret requires this of factor levels).
LEVELS <- paste0("kelas_", 0:3)

# Recode the occupancy count into a factor with exactly those levels.
df_bersih$Room_Occupancy_Count <- factor(
  x      = paste0("kelas_", df_bersih[["Room_Occupancy_Count"]]),
  levels = LEVELS
)

cat("Fitur prediktor :", ncol(df_bersih) - 1, "\n")

## Fitur prediktor : 16

cat("Kelas target    :", levels(df_bersih$Room_Occupancy_Count), "\n")

## Kelas target    : kelas_0 kelas_1 kelas_2 kelas_3

3.2 Split Data — 80% Latih / 20% Uji

# Reseed so the 80/20 partition is reproducible on its own.
set.seed(SEED)
idx_latih <- createDataPartition(
  y    = df_bersih$Room_Occupancy_Count,
  p    = 0.8,
  list = FALSE
)

# Rows selected by the partition form the training set; the rest is held out.
data_latih <- df_bersih[idx_latih, ]
data_uji   <- df_bersih[-idx_latih, ]

# Hold-out labels and predictors, shared by the evaluation of every model.
y_uji <- data_uji[["Room_Occupancy_Count"]]
X_uji <- select(data_uji, -Room_Occupancy_Count)

cat("Data latih :", nrow(data_latih), "observasi\n")

## Data latih : 8106 observasi

cat("Data uji   :", nrow(data_uji),   "observasi\n")

## Data uji   : 2023 observasi

cat("\nDistribusi kelas — Data Latih:\n")

## 
## Distribusi kelas — Data Latih:

print(table(data_latih$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583     368     599     556

cat("\nDistribusi kelas — Data Uji:\n")

## 
## Distribusi kelas — Data Uji:

print(table(data_uji$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    1645      91     149     138

3.3 Penanganan Ketidakseimbangan Kelas (SMOTE)

Mengapa SMOTE?
Kelas 0 (kosong) mencakup ~81% data. Tanpa penanganan, model cenderung mengabaikan kelas minoritas.
SMOTE membangkitkan sampel sintetis untuk kelas 1, 2, dan 3 melalui interpolasi di ruang fitur — hanya diterapkan pada data latih agar data uji tetap mencerminkan kondisi nyata.

set.seed(SEED)

# Recipe with a single SMOTE step on the target: over_ratio = 1 and
# 5 neighbours, seeded with SEED. Built from the training data only.
rec_smote <- step_smote(
  recipe(Room_Occupancy_Count ~ ., data = data_latih),
  Room_Occupancy_Count,
  over_ratio = 1,
  neighbors  = 5,
  seed       = SEED
)

# Estimate the recipe and extract the processed training set.
data_latih_s <- bake(prep(rec_smote), new_data = NULL)

# Re-level the target so its levels are exactly LEVELS, in that order.
data_latih_s$Room_Occupancy_Count <- factor(
  x      = data_latih_s$Room_Occupancy_Count,
  levels = LEVELS
)

cat("Distribusi sebelum SMOTE:\n")

## Distribusi sebelum SMOTE:

print(table(data_latih$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583     368     599     556

cat("\nDistribusi setelah SMOTE:\n")

## 
## Distribusi setelah SMOTE:

print(table(data_latih_s$Room_Occupancy_Count))

## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583    6583    6583    6583

# One row per training observation, first from the original training set and
# then from the SMOTE output, tagged with the condition it belongs to.
df_smote_vis <- data.frame(
  Kondisi = factor(
    rep(c("Sebelum SMOTE", "Setelah SMOTE"),
        times = c(nrow(data_latih), nrow(data_latih_s))),
    levels = c("Sebelum SMOTE", "Setelah SMOTE")
  ),
  Kelas = c(as.character(data_latih$Room_Occupancy_Count),
            as.character(data_latih_s$Room_Occupancy_Count))
)

# Side-by-side class counts, with the count printed above each bar.
ggplot(data = df_smote_vis, mapping = aes(x = Kelas, fill = Kelas)) +
  geom_bar(width = 0.6, color = "black") +
  geom_text(aes(label = after_stat(count)), stat = "count",
            size = 3.8, vjust = -0.4, fontface = "bold") +
  scale_fill_manual(values = c("kelas_0"="#4472C4","kelas_1"="#ED7D31","kelas_2"="#70AD47","kelas_3"="#E74C3C")) +
  facet_wrap(~Kondisi) +
  labs(x = "Kelas Penghuni", y = "Frekuensi", fill = "Kelas",
       title    = "Distribusi Kelas Sebelum dan Setelah SMOTE",
       subtitle = "SMOTE hanya diterapkan pada data latih") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"))

3.4 Standardisasi (khusus Regresi Logistik)

# Fitur kontinu yang akan distandarisasi
# Features to standardise: every column except the target and the two PIR
# columns (S6_PIR, S7_PIR), which are left on their original scale.
fitur_std <- setdiff(
  names(data_latih_s),
  c("Room_Occupancy_Count", "S6_PIR", "S7_PIR")
)

# Centre and scale are estimated from the (post-SMOTE) training set only.
# `[fitur_std]` always returns a data frame, even for a single column, and
# vapply() takes the sd column by column instead of letting apply() coerce
# the whole data frame to a matrix first.
mu_lat <- colMeans(data_latih_s[fitur_std])
sd_lat <- vapply(data_latih_s[fitur_std], sd, numeric(1))
sd_lat[sd_lat == 0] <- 1   # constant column: avoid division by zero

# Standardise the training set.
data_latih_std <- data_latih_s
data_latih_std[, fitur_std] <- scale(data_latih_s[fitur_std],
                                      center = mu_lat, scale = sd_lat)

# Standardise the test predictors with the TRAINING mean and sd, so no
# information from the test set enters the transformation.
X_uji_std <- X_uji
X_uji_std[, fitur_std] <- scale(X_uji[fitur_std],
                                  center = mu_lat, scale = sd_lat)

cat("Standardisasi selesai.\n")

## Standardisasi selesai.

cat("Contoh mean fitur latih (5 pertama):\n")

## Contoh mean fitur latih (5 pertama):

print(round(head(mu_lat, 5), 3))

##  S1_Temp  S2_Temp  S3_Temp  S4_Temp S1_Light 
##   25.788   26.039   25.415   26.031   93.695

4. Fungsi Evaluasi

# Fungsi terpusat untuk menghitung & menampilkan semua metrik
# Selalu memastikan level prediksi = level aktual = LEVELS
# Compute, print and return the evaluation metrics of one classifier.
#
# Args:
#   y_aktual    : true class labels (factor or character).
#   y_pred      : predicted class labels, same length as y_aktual.
#   nama_model  : model name shown in the printed header.
#   level_kelas : expected class labels, in order. Defaults to the global
#                 LEVELS, so existing three-argument calls are unchanged.
#
# Returns:
#   A list with cm (the caret confusionMatrix object), akurasi, and the
#   macro-averaged presisi, recall and f1. Classes whose per-class metric is
#   undefined (NA) are left out of the respective average.
#
# Errors:
#   Stops when the two vectors differ in length, or when either contains a
#   label that is not in level_kelas. Without this check such labels would be
#   turned into NA by factor() and silently dropped from the confusion matrix.
evaluasi <- function(y_aktual, y_pred, nama_model, level_kelas = LEVELS) {

  if (length(y_aktual) != length(y_pred)) {
    stop(sprintf("Panjang y_aktual (%d) dan y_pred (%d) tidak sama.",
                 length(y_aktual), length(y_pred)),
         call. = FALSE)
  }

  aktual_chr <- as.character(y_aktual)
  pred_chr   <- as.character(y_pred)

  # Genuine NAs are tolerated here; only unknown labels are rejected.
  label_asing <- setdiff(c(aktual_chr, pred_chr), c(level_kelas, NA))
  if (length(label_asing) > 0) {
    stop(sprintf("Label di luar level yang diharapkan: %s",
                 paste(label_asing, collapse = ", ")),
         call. = FALSE)
  }

  # Force both vectors onto the exact same factor levels.
  y_aktual <- factor(aktual_chr, levels = level_kelas)
  y_pred   <- factor(pred_chr,   levels = level_kelas)

  cm  <- confusionMatrix(data = y_pred, reference = y_aktual)
  acc <- as.numeric(cm$overall["Accuracy"])
  pre <- mean(cm$byClass[, "Precision"], na.rm = TRUE)
  rec <- mean(cm$byClass[, "Recall"],    na.rm = TRUE)
  f1  <- mean(cm$byClass[, "F1"],        na.rm = TRUE)

  cat(sprintf("\n========== %s ==========\n", nama_model))
  cat(sprintf("  Akurasi  : %.4f  (%.2f%%)\n", acc, acc * 100))
  cat(sprintf("  Presisi  : %.4f\n", pre))
  cat(sprintf("  Recall   : %.4f\n", rec))
  cat(sprintf("  F1-Score : %.4f\n", f1))
  cat("\nConfusion Matrix:\n")
  print(cm$table)

  # Metrics are returned so they can be collected into the comparison table.
  list(cm = cm, akurasi = acc, presisi = pre, recall = rec, f1 = f1)
}

5. Setup Cross-Validation (untuk Hyperparameter Tuning)

5-Fold Stratified CV digunakan untuk memilih hyperparameter terbaik. Stratified memastikan setiap fold memiliki proporsi kelas yang representatif.

set.seed(SEED)

# Resampling scheme shared by all three caret tuning runs: 5-fold CV with
# class probabilities, multi-class summary metrics, and the held-out
# predictions of the winning configuration kept.
# NOTE(review): the tuning calls pass the SMOTE-augmented training set, so
# the folds are drawn from already-oversampled data and the CV scores may be
# optimistic — confirm this is acceptable.
ctrl_cv <- trainControl(
  method          = "cv",
  number          = 5,
  summaryFunction = multiClassSummary,
  classProbs      = TRUE,
  verboseIter     = FALSE,
  savePredictions = "final"
)

cat("Cross-validation: 5-Fold Stratified\n")

## Cross-validation: 5-Fold Stratified

cat("Metric tuning   : Akurasi\n")

## Metric tuning   : Akurasi

6. Model 1 — Decision Tree

6.1 Hyperparameter Tuning

cp (complexity parameter): mengontrol ukuran pohon.
Nilai kecil → pohon lebih dalam (risiko overfit),
nilai besar → pohon lebih sederhana (risiko underfit).

set.seed(SEED)

# Candidate complexity parameters, from a very deep tree to a shallow one.
grid_dt   <- expand.grid(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05))

# Tune cp with 5-fold CV on the SMOTE-balanced training set.
# The rpart control is passed through so that cp is selected under the same
# growth limits (maxdepth = 15, minsplit = 10) that the final rpart() fit
# uses; otherwise cp would be chosen for trees grown with rpart's defaults.
# caret sets the cp value of this control itself for each candidate.
tuning_dt <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih_s,
  method    = "rpart",
  trControl = ctrl_cv,
  tuneGrid  = grid_dt,
  metric    = "Accuracy",
  control   = rpart.control(maxdepth = 15, minsplit = 10)
)

cat("Hasil Tuning Decision Tree:\n")

## Hasil Tuning Decision Tree:

print(tuning_dt$results[, c("cp", "Accuracy", "Kappa")])

##      cp  Accuracy     Kappa
## 1 1e-04 0.9958985 0.9945313
## 2 5e-04 0.9933539 0.9911385
## 3 1e-03 0.9879231 0.9838975
## 4 5e-03 0.9761124 0.9681499
## 5 1e-02 0.9544280 0.9392373
## 6 5e-02 0.8867921 0.8490561

cat("\nCP terbaik:", tuning_dt$bestTune$cp, "\n")

## 
## CP terbaik: 1e-04

plot(tuning_dt, main = "Tuning Decision Tree — Akurasi vs CP")

6.2 Latih Model Final

# Fit the final decision tree on the SMOTE-balanced training set using the
# cp selected by the caret tuning run.
set.seed(SEED)
cp_terbaik <- tuning_dt$bestTune$cp

# NOTE(review): cp was selected by caret; confirm that tuning used the same
# maxdepth / minsplit as this fit, otherwise the selected cp may not be the
# best value for this tree.
model_dt <- rpart(
  Room_Occupancy_Count ~ .,
  data    = data_latih_s,
  method  = "class",
  control = rpart.control(cp = cp_terbaik, maxdepth = 15, minsplit = 10)
)

cat("Decision Tree dilatih dengan cp =", cp_terbaik, "\n")

## Decision Tree dilatih dengan cp = 1e-04

printcp(model_dt)

## 
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih_s, 
##     method = "class", control = rpart.control(cp = cp_terbaik, 
##         maxdepth = 15, minsplit = 10))
## 
## Variables actually used in tree construction:
##  [1] S1_Light     S1_Sound     S1_Temp      S2_Light     S2_Sound    
##  [6] S2_Temp      S3_Light     S3_Sound     S3_Temp      S4_Light    
## [11] S4_Sound     S5_CO2       S5_CO2_Slope S7_PIR      
## 
## Root node error: 19749/26332 = 0.75
## 
## n= 26332 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.31368677      0 1.0000000 1.0088612 0.00352584
## 2  0.24654413      1 0.6863132 0.6863132 0.00410656
## 3  0.12679123      2 0.4397691 0.4403261 0.00386432
## 4  0.08344726      3 0.3129779 0.3145476 0.00348853
## 5  0.07909261      4 0.2295306 0.2311003 0.00311025
## 6  0.03772343      5 0.1504380 0.1511469 0.00260496
## 7  0.02729252      6 0.1127146 0.1133222 0.00229138
## 8  0.01316522      7 0.0854220 0.0859790 0.00201813
## 9  0.01184870      8 0.0722568 0.0714467 0.00185037
## 10 0.00911439      9 0.0604081 0.0606613 0.00171227
## 11 0.00886121     10 0.0512937 0.0502810 0.00156525
## 12 0.00673452     11 0.0424325 0.0433440 0.00145719
## 13 0.00546863     12 0.0356980 0.0368626 0.00134720
## 14 0.00379766     13 0.0302294 0.0316978 0.00125175
## 15 0.00244738     14 0.0264317 0.0261279 0.00113889
## 16 0.00212669     17 0.0190896 0.0218239 0.00104258
## 17 0.00136716     18 0.0169629 0.0193428 0.00098246
## 18 0.00101271     20 0.0142286 0.0160514 0.00089610
## 19 0.00086080     22 0.0122031 0.0150894 0.00086914
## 20 0.00081017     23 0.0113423 0.0138235 0.00083229
## 21 0.00073421     24 0.0105322 0.0135197 0.00082319
## 22 0.00064138     26 0.0090638 0.0129120 0.00080466
## 23 0.00055699     29 0.0071396 0.0107347 0.00073429
## 24 0.00050635     30 0.0065826 0.0102284 0.00071690
## 25 0.00040508     31 0.0060763 0.0094182 0.00068813
## 26 0.00035445     32 0.0056712 0.0074434 0.00061221
## 27 0.00024305     35 0.0046078 0.0062788 0.00056252
## 28 0.00020254     40 0.0033926 0.0049623 0.00050033
## 29 0.00015191     46 0.0021773 0.0042534 0.00046334
## 30 0.00010127     48 0.0018735 0.0038989 0.00044367
## 31 0.00010000     49 0.0017722 0.0038483 0.00044079

6.3 Visualisasi Pohon Keputusan

# Draw the fitted tree with one box colour per class and leaves aligned at
# the bottom of the plot.
rpart.plot(
  x             = model_dt,
  main          = "Decision Tree - Estimasi Penghuni Ruangan",
  type          = 4,
  extra         = 104,
  fallen.leaves = TRUE,
  cex           = 0.7,
  shadow.col    = "gray",
  box.palette   = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000")
)

6.4 Evaluasi Decision Tree

# Predict class labels for the hold-out predictors and score them against
# the true labels; evaluasi() aligns both vectors to LEVELS.
pred_dt <- predict(object = model_dt, newdata = X_uji, type = "class")
hasil_dt <- evaluasi(
  y_aktual   = y_uji,
  y_pred     = pred_dt,
  nama_model = "Decision Tree"
)

## 
## ========== Decision Tree ==========
##   Akurasi  : 0.9931  (99.31%)
##   Presisi  : 0.9748
##   Recall   : 0.9743
##   F1-Score : 0.9745
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1644       0       0       2
##    kelas_1       0      88       4       0
##    kelas_2       0       2     143       2
##    kelas_3       1       1       2     134

7. Model 2 — Regresi Logistik Multinomial

7.1 Hyperparameter Tuning

decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar.

set.seed(SEED)

# Candidate weight-decay values for the multinomial model.
grid_rl <- expand.grid(decay = c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5))

# Tune decay with 5-fold CV on the standardised, SMOTE-balanced training set.
# maxit, MaxNWts and trace are forwarded to the underlying model fit.
tuning_rl <- train(
  form      = Room_Occupancy_Count ~ .,
  data      = data_latih_std,
  method    = "multinom",
  metric    = "Accuracy",
  tuneGrid  = grid_rl,
  trControl = ctrl_cv,
  maxit     = 200,
  MaxNWts   = 5000,
  trace     = FALSE
)

cat("Hasil Tuning Regresi Logistik:\n")

## Hasil Tuning Regresi Logistik:

print(tuning_rl$results[, c("decay", "Accuracy", "Kappa")])

##   decay  Accuracy     Kappa
## 1 1e-04 0.9899362 0.9865815
## 2 1e-03 0.9896324 0.9861765
## 3 1e-02 0.9885309 0.9847079
## 4 5e-02 0.9880752 0.9841003
## 5 1e-01 0.9876954 0.9835939
## 6 5e-01 0.9864422 0.9819229

cat("\nDecay terbaik:", tuning_rl$bestTune$decay, "\n")

## 
## Decay terbaik: 1e-04

plot(tuning_rl, main = "Tuning Regresi Logistik — Akurasi vs Decay")

7.2 Latih Model Final

set.seed(SEED)
decay_terbaik <- tuning_rl$bestTune[["decay"]]

# Refit the multinomial model on the full standardised training set with the
# selected decay; messages emitted during fitting are suppressed.
model_rl <- suppressMessages(
  multinom(
    formula = Room_Occupancy_Count ~ .,
    data    = data_latih_std,
    decay   = decay_terbaik,
    maxit   = 200,
    MaxNWts = 5000,
    trace   = FALSE
  )
)

cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")

## Regresi Logistik dilatih dengan decay = 1e-04

7.3 Evaluasi Regresi Logistik

# Predict on the standardised hold-out predictors (same scaling as training)
# and score; evaluasi() aligns both label vectors to LEVELS.
pred_rl <- predict(object = model_rl, newdata = X_uji_std, type = "class")
hasil_rl <- evaluasi(
  y_aktual   = y_uji,
  y_pred     = pred_rl,
  nama_model = "Regresi Logistik"
)

## 
## ========== Regresi Logistik ==========
##   Akurasi  : 0.9960  (99.60%)
##   Presisi  : 0.9900
##   Recall   : 0.9827
##   F1-Score : 0.9861
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       2
##    kelas_1       0      88       0       0
##    kelas_2       0       3     149       3
##    kelas_3       0       0       0     133

8. Model 3 — Random Forest

8.1 Hyperparameter Tuning

mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Nilai kecil → pohon lebih beragam,
nilai besar → tiap pohon lebih kuat tapi lebih berkorelasi satu sama lain.

set.seed(SEED)

# Candidate numbers of features tried at each split.
grid_rf <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))

# Tune mtry with 5-fold CV on the SMOTE-balanced training set; ntree is
# forwarded to the forest fit and fixed at 100 for every candidate.
tuning_rf <- train(
  form      = Room_Occupancy_Count ~ .,
  data      = data_latih_s,
  method    = "rf",
  metric    = "Accuracy",
  tuneGrid  = grid_rf,
  trControl = ctrl_cv,
  ntree     = 100
)

cat("Hasil Tuning Random Forest:\n")

## Hasil Tuning Random Forest:

print(tuning_rf$results[, c("mtry", "Accuracy", "Kappa")])

##   mtry  Accuracy     Kappa
## 1    2 0.9994684 0.9992912
## 2    4 0.9994684 0.9992911
## 3    6 0.9995063 0.9993418
## 4    8 0.9995063 0.9993418
## 5   10 0.9992785 0.9990380
## 6   12 0.9992025 0.9989367

cat("\nmtry terbaik:", tuning_rf$bestTune$mtry, "\n")

## 
## mtry terbaik: 6

plot(tuning_rf, main = "Tuning Random Forest — Akurasi vs mtry")

8.2 Latih Model Final

# Fit the final random forest on the SMOTE-balanced training set using the
# mtry selected by the caret tuning run.
set.seed(SEED)
mtry_terbaik <- tuning_rf$bestTune$mtry

# importance = TRUE stores the variable-importance measures that the
# feature-importance plot reads.
# NOTE(review): tuning used ntree = 100 while this fit uses ntree = 300 —
# confirm the difference is intended.
model_rf <- randomForest(
  Room_Occupancy_Count ~ .,
  data       = data_latih_s,
  ntree      = 300,
  mtry       = mtry_terbaik,
  importance = TRUE
)

print(model_rf)

## 
## Call:
##  randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih_s,      ntree = 300, mtry = mtry_terbaik, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 0.05%
## Confusion matrix:
##         kelas_0 kelas_1 kelas_2 kelas_3  class.error
## kelas_0    6582       1       0       0 0.0001519064
## kelas_1       0    6582       1       0 0.0001519064
## kelas_2       0       0    6580       3 0.0004557193
## kelas_3       1       0       6    6576 0.0010633450

8.3 Feature Importance

# Importance matrix of the fitted forest; only the MeanDecreaseAccuracy
# column is plotted, sorted from most to least important.
imp_mat <- importance(model_rf)
df_imp <- arrange(
  data.frame(
    Fitur      = rownames(imp_mat),
    Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
  ),
  desc(Pentingnya)
)

# Horizontal bar chart; bars are ordered and shaded by importance.
ggplot(
  data    = df_imp,
  mapping = aes(x = reorder(Fitur, Pentingnya), y = Pentingnya,
                fill = Pentingnya)
) +
  geom_col(width = 0.7, color = "black") +
  scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
  coord_flip() +
  labs(x = "Variabel Sensor", y = "Mean Decrease Accuracy",
       title    = "Pentingnya Fitur — Random Forest",
       subtitle = "Berdasarkan Mean Decrease Accuracy") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"))

8.4 Evaluasi Random Forest

# Predict class labels for the hold-out predictors and score them;
# evaluasi() aligns both label vectors to LEVELS.
pred_rf <- predict(object = model_rf, newdata = X_uji, type = "class")
hasil_rf <- evaluasi(
  y_aktual   = y_uji,
  y_pred     = pred_rf,
  nama_model = "Random Forest"
)

## 
## ========== Random Forest ==========
##   Akurasi  : 0.9975  (99.75%)
##   Presisi  : 0.9909
##   Recall   : 0.9904
##   F1-Score : 0.9906
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       1
##    kelas_1       0      90       2       0
##    kelas_2       0       1     146       0
##    kelas_3       0       0       1     137

9. Perbandingan Ketiga Model

Tabel Perbandingan

# Comparison table: one row per model with its selected hyperparameter and
# the four test-set metrics rounded to 4 decimals. Helpers stay inside
# local() so no extra objects are created in the workspace.
tabel_hasil <- local({
  hasil_semua <- list(hasil_dt, hasil_rl, hasil_rf)

  # Pull one metric from every result list, in model order, and round it.
  ambil_metrik <- function(nama) {
    round(vapply(hasil_semua, function(h) h[[nama]], numeric(1)), 4)
  }

  data.frame(
    Model          = c("Decision Tree", "Regresi Logistik", "Random Forest"),
    Hyperparameter = c(
      paste0("cp = ",    cp_terbaik),
      paste0("decay = ", decay_terbaik),
      paste0("mtry = ",  mtry_terbaik)
    ),
    Akurasi  = ambil_metrik("akurasi"),
    Presisi  = ambil_metrik("presisi"),
    Recall   = ambil_metrik("recall"),
    F1_Score = ambil_metrik("f1")
  )
})

print(tabel_hasil)

##              Model Hyperparameter Akurasi Presisi Recall F1_Score
## 1    Decision Tree     cp = 1e-04  0.9931  0.9748 0.9743   0.9745
## 2 Regresi Logistik  decay = 1e-04  0.9960  0.9900 0.9827   0.9861
## 3    Random Forest       mtry = 6  0.9975  0.9909 0.9904   0.9906

Grafik Perbandingan

# Long format: one row per (model, metric) pair for faceting.
kolom_plot <- c("Model", "Akurasi", "Presisi", "Recall", "F1_Score")
df_plot <- melt(
  tabel_hasil[, kolom_plot],
  id.vars       = "Model",
  variable.name = "Metrik",
  value.name    = "Nilai"
)

# One panel per metric, one bar per model, value printed above each bar.
ggplot(data = df_plot, mapping = aes(x = Model, y = Nilai, fill = Model)) +
  geom_col(width = 0.6, color = "black") +
  geom_text(aes(label = sprintf("%.3f", Nilai)),
            size = 3.8, vjust = -0.4, fontface = "bold") +
  facet_wrap(~Metrik, ncol = 4) +
  scale_y_continuous(labels = percent_format(accuracy = 1),
                     limits = c(0, 1.15)) +
  scale_fill_manual(values = c(
    "Decision Tree"    = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest"    = "#70AD47"
  )) +
  labs(x = NULL, y = "Nilai Metrik", fill = "Model",
       title    = "Perbandingan Performa Ketiga Model",
       subtitle = paste0("SMOTE + Hyperparameter Tuning (5-Fold CV) | Seed = ", SEED)) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom",
        plot.title      = element_text(face = "bold"),
        strip.text      = element_text(face = "bold"),
        axis.ticks.x    = element_blank(),
        axis.text.x     = element_blank())

Confusion Matrix Visual — Model Terbaik

# The model with the highest macro F1 (first one on ties) is the "best" model.
terbaik_idx  <- which.max(tabel_hasil[["F1_Score"]])
nama_terbaik <- tabel_hasil[["Model"]][terbaik_idx]

# Pick the confusion-matrix object that belongs to that model.
cm_terbaik <- switch(
  nama_terbaik,
  "Random Forest"    = hasil_rf$cm,
  "Regresi Logistik" = hasil_rl$cm,
  "Decision Tree"    = hasil_dt$cm
)

# Long table of (predicted, actual, count) cells for the heat map.
df_cm <- setNames(
  as.data.frame(cm_terbaik$table),
  c("Prediksi", "Aktual", "Frekuensi")
)

ggplot(data = df_cm, mapping = aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(linewidth = 1.2, color = "white") +
  geom_text(aes(label = Frekuensi), size = 6,
            fontface = "bold", color = "black") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
  labs(x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Jumlah",
       title    = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
       subtitle = "Baris = Prediksi | Kolom = Nilai Aktual") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold"))

10. Ringkasan Akhir

# Row of the comparison table with the highest F1 score.
terbaik <- slice(arrange(tabel_hasil, desc(F1_Score)), 1)

cat("============================================================\n")

## ============================================================

cat(sprintf("  SEED                   : %d\n", SEED))

##   SEED                   : 42

cat(sprintf("  Penanganan Imbalance   : SMOTE (K=5)\n"))

##   Penanganan Imbalance   : SMOTE (K=5)

cat(sprintf("  Evaluasi Tuning        : 5-Fold Stratified CV\n"))

##   Evaluasi Tuning        : 5-Fold Stratified CV

cat("------------------------------------------------------------\n")

## ------------------------------------------------------------

cat(sprintf("  MODEL TERBAIK          : %s\n",   terbaik$Model))

##   MODEL TERBAIK          : Random Forest

cat(sprintf("  Hyperparameter Terbaik : %s\n",   terbaik$Hyperparameter))

##   Hyperparameter Terbaik : mtry = 6

cat(sprintf("  Akurasi                : %.4f (%.2f%%)\n",
            terbaik$Akurasi, terbaik$Akurasi * 100))

##   Akurasi                : 0.9975 (99.75%)

cat(sprintf("  F1-Score               : %.4f\n", terbaik$F1_Score))

##   F1-Score               : 0.9906

cat("============================================================\n")

## ============================================================

Occupancy Prediction — Pra-pemrosesan & Pemodelan

Decision Tree | Regresi Logistik | Random Forest

Kelompok G

2026-04-21

1. Muat Library

2. Penentuan Seed & Load Data

3. Pra-pemrosesan

3.1 Hapus Kolom Waktu & Konversi Target

3.2 Split Data — 80% Latih / 20% Uji

3.3 Penanganan Ketidakseimbangan Kelas (SMOTE)

3.4 Standardisasi (khusus Regresi Logistik)

4. Fungsi Evaluasi

5. Setup Cross-Validation (untuk Hyperparameter Tuning)

6. Model 1 — Decision Tree

6.1 Hyperparameter Tuning

6.2 Latih Model Final

6.3 Visualisasi Pohon Keputusan

6.4 Evaluasi Decision Tree

7. Model 2 — Regresi Logistik Multinomial

7.1 Hyperparameter Tuning

7.2 Latih Model Final

7.3 Evaluasi Regresi Logistik

8. Model 3 — Random Forest

8.1 Hyperparameter Tuning

8.2 Latih Model Final

8.3 Feature Importance

8.4 Evaluasi Random Forest

9. Perbandingan Ketiga Model

Tabel Perbandingan

Grafik Perbandingan

Confusion Matrix Visual — Model Terbaik

10. Ringkasan Akhir