1. Muat Library

library(rpart)        # Decision Tree
library(rpart.plot)   # Visualisasi Decision Tree
library(nnet)         # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(caret)        # CV, tuning, confusion matrix
library(pROC)         # AUC-ROC
library(ggplot2)      # Visualisasi
library(dplyr)        # Manipulasi data
library(reshape2)     # Melt untuk plot
library(scales)       # Format label

# install.packages(c("rpart","rpart.plot","nnet","randomForest",
#                    "caret","pROC","ggplot2","dplyr","reshape2","scales"))

2. Penentuan Seed & Load Data

Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, cross-validation, model berbasis pohon) menghasilkan hasil yang sama setiap kali dijalankan ulang.

SEED <- 42
set.seed(SEED)
cat("Seed aktif:", SEED, "\n")
## Seed aktif: 42
df <- read.csv("Occupancy_Estimation.csv",
               stringsAsFactors = FALSE)
cat("Dataset dimuat:", nrow(df), "baris x", ncol(df), "kolom\n")
## Dataset dimuat: 10129 baris x 19 kolom
cat("Missing values :", sum(is.na(df)), "\n")
## Missing values : 0

3. Pra-pemrosesan

3.1 Hapus Kolom Waktu & Konversi Target

df_bersih <- df %>% select(-Date, -Time)

# Gunakan prefix "kelas_" agar level menjadi valid R variable name
# (caret membutuhkan level faktor yang valid sebagai nama variabel)
LEVELS <- c("kelas_0", "kelas_1", "kelas_2", "kelas_3")

df_bersih$Room_Occupancy_Count <- factor(
  paste0("kelas_", df_bersih$Room_Occupancy_Count),
  levels = LEVELS
)

cat("Fitur prediktor :", ncol(df_bersih) - 1, "\n")
## Fitur prediktor : 16
cat("Kelas target    :", levels(df_bersih$Room_Occupancy_Count), "\n")
## Kelas target    : kelas_0 kelas_1 kelas_2 kelas_3

3.2 Split Data — 80% Latih / 20% Uji

set.seed(SEED)
idx_latih  <- createDataPartition(df_bersih$Room_Occupancy_Count,
                                   p = 0.8, list = FALSE)
data_latih <- df_bersih[ idx_latih, ]
data_uji   <- df_bersih[-idx_latih, ]

# Variabel uji — digunakan oleh SEMUA model
X_uji <- data_uji %>% select(-Room_Occupancy_Count)
y_uji <- data_uji$Room_Occupancy_Count

cat("Data latih :", nrow(data_latih), "observasi\n")
## Data latih : 8106 observasi
cat("Data uji   :", nrow(data_uji),   "observasi\n")
## Data uji   : 2023 observasi
cat("\nDistribusi kelas — Data Latih:\n")
## 
## Distribusi kelas — Data Latih:
print(table(data_latih$Room_Occupancy_Count))
## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    6583     368     599     556
cat("\nDistribusi kelas — Data Uji:\n")
## 
## Distribusi kelas — Data Uji:
print(table(data_uji$Room_Occupancy_Count))
## 
## kelas_0 kelas_1 kelas_2 kelas_3 
##    1645      91     149     138
# Visualisasi distribusi kelas data latih (tanpa SMOTE)
df_dist_vis <- data.frame(
  Kelas = as.character(data_latih$Room_Occupancy_Count)
)

ggplot(df_dist_vis, aes(x = Kelas, fill = Kelas)) +
  geom_bar(color = "black", width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.4, size = 3.8, fontface = "bold") +
  scale_fill_manual(values = c("kelas_0"="#4472C4","kelas_1"="#ED7D31",
                                "kelas_2"="#70AD47","kelas_3"="#E74C3C")) +
  labs(title    = "Distribusi Kelas Data Latih (Tanpa SMOTE)",
       subtitle = "Data asli tanpa oversampling",
       x = "Kelas Penghuni", y = "Frekuensi", fill = "Kelas") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

3.3 Standardisasi (khusus Regresi Logistik)

Standardisasi hanya diterapkan untuk Regresi Logistik Multinomial karena algoritma ini sensitif terhadap skala fitur. Decision Tree dan Random Forest tidak memerlukan standardisasi.

Parameter mean & sd dihitung dari data latih saja, lalu diterapkan ke data uji untuk menghindari data leakage.

# Fitur kontinu yang akan distandarisasi
# (kecuali variabel biner PIR dan kolom target)
fitur_std <- setdiff(names(data_latih),
                      c("Room_Occupancy_Count", "S6_PIR", "S7_PIR"))

# Hitung mean & sd DARI DATA LATIH saja
mu_lat <- colMeans(data_latih[, fitur_std])
sd_lat <- apply(data_latih[, fitur_std], 2, sd)
sd_lat[sd_lat == 0] <- 1   # hindari bagi nol

# Terapkan ke data latih → data_latih_std (khusus Regresi Logistik)
data_latih_std <- data_latih
data_latih_std[, fitur_std] <- scale(data_latih[, fitur_std],
                                      center = mu_lat, scale = sd_lat)

# Terapkan ke data uji (pakai mu & sd dari latih, bukan dari uji!)
X_uji_std <- X_uji
X_uji_std[, fitur_std] <- scale(X_uji[, fitur_std],
                                  center = mu_lat, scale = sd_lat)

cat("Standardisasi selesai.\n")
## Standardisasi selesai.
cat("Fitur yang distandarisasi:", length(fitur_std), "fitur\n")
## Fitur yang distandarisasi: 14 fitur
cat("Contoh mean fitur latih (5 pertama):\n")
## Contoh mean fitur latih (5 pertama):
print(round(head(mu_lat, 5), 3))
##  S1_Temp  S2_Temp  S3_Temp  S4_Temp S1_Light 
##   25.454   25.548   25.058   25.755   25.650

4. Fungsi Evaluasi

# Fungsi terpusat untuk menghitung & menampilkan semua metrik
# Selalu memastikan level prediksi = level aktual = LEVELS
evaluasi <- function(y_aktual, y_pred, nama_model) {

  # Paksa kedua vektor menjadi faktor dengan level yang sama persis
  y_aktual <- factor(as.character(y_aktual), levels = LEVELS)
  y_pred   <- factor(as.character(y_pred),   levels = LEVELS)

  cm  <- confusionMatrix(y_pred, y_aktual)
  acc <- as.numeric(cm$overall["Accuracy"])
  pre <- mean(cm$byClass[, "Precision"], na.rm = TRUE)
  rec <- mean(cm$byClass[, "Recall"],    na.rm = TRUE)
  f1  <- mean(cm$byClass[, "F1"],        na.rm = TRUE)

  cat(sprintf("\n========== %s ==========\n", nama_model))
  cat(sprintf("  Akurasi  : %.4f  (%.2f%%)\n", acc, acc * 100))
  cat(sprintf("  Presisi  : %.4f\n", pre))
  cat(sprintf("  Recall   : %.4f\n", rec))
  cat(sprintf("  F1-Score : %.4f\n", f1))
  cat("\nConfusion Matrix:\n")
  print(cm$table)

  # Kembalikan list metrik untuk tabel perbandingan
  list(cm = cm, akurasi = acc, presisi = pre, recall = rec, f1 = f1)
}

5. Setup Cross-Validation (untuk Hyperparameter Tuning)

5-Fold CV digunakan untuk memilih hyperparameter terbaik. Setiap fold memiliki proporsi kelas yang representatif sesuai data asli.

set.seed(SEED)

ctrl_cv <- trainControl(
  method          = "cv",
  number          = 5,
  classProbs      = TRUE,
  summaryFunction = multiClassSummary,
  savePredictions = "final",
  verboseIter     = FALSE
)

cat("Cross-validation: 5-Fold CV\n")
## Cross-validation: 5-Fold CV
cat("Metric tuning   : Akurasi\n")
## Metric tuning   : Akurasi

6. Model 1 — Decision Tree

6.1 Hyperparameter Tuning

cp (complexity parameter): mengontrol ukuran pohon.
Nilai kecil → pohon lebih dalam (risiko overfit),
nilai besar → pohon lebih sederhana (risiko underfit).

set.seed(SEED)

grid_dt   <- expand.grid(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05))

tuning_dt <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih,   # data latih tanpa SMOTE
  method    = "rpart",
  trControl = ctrl_cv,
  tuneGrid  = grid_dt,
  metric    = "Accuracy"
)

cat("Hasil Tuning Decision Tree:\n")
## Hasil Tuning Decision Tree:
print(tuning_dt$results[, c("cp", "Accuracy", "Kappa")])
##      cp  Accuracy     Kappa
## 1 1e-04 0.9925987 0.9774544
## 2 5e-04 0.9925987 0.9774544
## 3 1e-03 0.9925987 0.9774545
## 4 5e-03 0.9913655 0.9735933
## 5 1e-02 0.9888973 0.9660013
## 6 5e-02 0.9738471 0.9204301
cat("\nCP terbaik:", tuning_dt$bestTune$cp, "\n")
## 
## CP terbaik: 0.001
plot(tuning_dt, main = "Tuning Decision Tree — Akurasi vs CP")

6.2 Latih Model Final

set.seed(SEED)
cp_terbaik <- tuning_dt$bestTune$cp

model_dt <- rpart(
  Room_Occupancy_Count ~ .,
  data    = data_latih,
  method  = "class",
  control = rpart.control(cp = cp_terbaik, maxdepth = 15, minsplit = 10)
)

cat("Decision Tree dilatih dengan cp =", cp_terbaik, "\n")
## Decision Tree dilatih dengan cp = 0.001
printcp(model_dt)
## 
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih, 
##     method = "class", control = rpart.control(cp = cp_terbaik, 
##         maxdepth = 15, minsplit = 10))
## 
## Variables actually used in tree construction:
##  [1] S1_Light     S1_Temp      S2_Light     S2_Sound     S2_Temp     
##  [6] S3_Light     S3_Temp      S4_Sound     S4_Temp      S5_CO2      
## [11] S5_CO2_Slope S7_PIR      
## 
## Root node error: 1523/8106 = 0.18789
## 
## n= 8106 
## 
##           CP nsplit rel error   xerror      xstd
## 1  0.3933027      0  1.000000 1.000000 0.0230918
## 2  0.1845043      1  0.606697 0.607354 0.0187958
## 3  0.1260670      2  0.422193 0.422850 0.0159870
## 4  0.1011162      3  0.296126 0.299409 0.0136210
## 5  0.0623769      4  0.195010 0.196980 0.0111602
## 6  0.0374261      5  0.132633 0.143139 0.0095633
## 7  0.0137886      6  0.095207 0.105712 0.0082482
## 8  0.0131320      7  0.081418 0.088641 0.0075652
## 9  0.0105056      8  0.068286 0.073539 0.0069006
## 10 0.0085358      9  0.057781 0.063033 0.0063951
## 11 0.0065660     10  0.049245 0.057124 0.0060914
## 12 0.0039396     12  0.036113 0.049902 0.0056972
## 13 0.0021887     13  0.032173 0.044649 0.0053917
## 14 0.0019698     18  0.020355 0.038083 0.0049826
## 15 0.0013132     19  0.018385 0.037426 0.0049398
## 16 0.0010000     21  0.015758 0.038739 0.0050250

6.3 Visualisasi Pohon Keputusan

rpart.plot(
  model_dt,
  type          = 4,
  extra         = 104,
  fallen.leaves = TRUE,
  main          = "Decision Tree - Estimasi Penghuni Ruangan",
  box.palette   = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000"),
  shadow.col    = "gray",
  cex           = 0.7
)

6.4 Evaluasi Decision Tree

pred_dt  <- predict(model_dt, X_uji, type = "class")
hasil_dt <- evaluasi(y_uji, pred_dt, "Decision Tree")
## 
## ========== Decision Tree ==========
##   Akurasi  : 0.9960  (99.60%)
##   Presisi  : 0.9848
##   Recall   : 0.9840
##   F1-Score : 0.9844
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       1
##    kelas_1       0      89       3       0
##    kelas_2       0       1     146       2
##    kelas_3       0       1       0     135

7. Model 2 — Regresi Logistik Multinomial

Catatan: Regresi Logistik menggunakan data_latih_std (data yang telah distandarisasi) agar koefisien model tidak bias akibat perbedaan skala fitur. Data uji juga distandarisasi menggunakan parameter yang sama dari data latih.

7.1 Hyperparameter Tuning

decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar.

set.seed(SEED)

grid_rl   <- expand.grid(decay = c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5))

tuning_rl <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih_std,   # data latih yang sudah distandarisasi
  method    = "multinom",
  trControl = ctrl_cv,
  tuneGrid  = grid_rl,
  maxit     = 200,
  MaxNWts   = 5000,
  trace     = FALSE,
  metric    = "Accuracy"
)

cat("Hasil Tuning Regresi Logistik:\n")
## Hasil Tuning Regresi Logistik:
print(tuning_rl$results[, c("decay", "Accuracy", "Kappa")])
##   decay  Accuracy     Kappa
## 1 1e-04 0.9938320 0.9812253
## 2 1e-03 0.9944486 0.9831144
## 3 1e-02 0.9950655 0.9849855
## 4 5e-02 0.9940786 0.9819768
## 5 1e-01 0.9937085 0.9808430
## 6 5e-01 0.9909948 0.9725448
cat("\nDecay terbaik:", tuning_rl$bestTune$decay, "\n")
## 
## Decay terbaik: 0.01
plot(tuning_rl, main = "Tuning Regresi Logistik — Akurasi vs Decay")

7.2 Latih Model Final

set.seed(SEED)
decay_terbaik <- tuning_rl$bestTune$decay

suppressMessages(
  model_rl <- multinom(
    Room_Occupancy_Count ~ .,
    data    = data_latih_std,   # data latih yang sudah distandarisasi
    MaxNWts = 5000,
    maxit   = 200,
    trace   = FALSE,
    decay   = decay_terbaik
  )
)

cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")
## Regresi Logistik dilatih dengan decay = 0.01

7.3 Evaluasi Regresi Logistik

pred_rl  <- predict(model_rl, X_uji_std, type = "class")   # gunakan X_uji_std
hasil_rl <- evaluasi(y_uji, pred_rl, "Regresi Logistik")
## 
## ========== Regresi Logistik ==========
##   Akurasi  : 0.9960  (99.60%)
##   Presisi  : 0.9914
##   Recall   : 0.9846
##   F1-Score : 0.9878
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       3
##    kelas_1       0      90       0       0
##    kelas_2       0       1     149       4
##    kelas_3       0       0       0     131

8. Model 3 — Random Forest

8.1 Hyperparameter Tuning

mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Nilai kecil → pohon lebih beragam,
nilai besar → tiap pohon lebih kuat tapi lebih berkorelasi satu sama lain.

set.seed(SEED)

grid_rf   <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))

tuning_rf <- train(
  Room_Occupancy_Count ~ .,
  data      = data_latih,   # data latih tanpa SMOTE
  method    = "rf",
  trControl = ctrl_cv,
  tuneGrid  = grid_rf,
  ntree     = 100,
  metric    = "Accuracy"
)

cat("Hasil Tuning Random Forest:\n")
## Hasil Tuning Random Forest:
print(tuning_rf$results[, c("mtry", "Accuracy", "Kappa")])
##   mtry  Accuracy     Kappa
## 1    2 0.9974095 0.9921062
## 2    4 0.9975328 0.9924841
## 3    6 0.9976561 0.9928543
## 4    8 0.9974093 0.9921018
## 5   10 0.9975326 0.9924772
## 6   12 0.9975325 0.9924802
cat("\nmtry terbaik:", tuning_rf$bestTune$mtry, "\n")
## 
## mtry terbaik: 6
plot(tuning_rf, main = "Tuning Random Forest — Akurasi vs mtry")

8.2 Latih Model Final

set.seed(SEED)
mtry_terbaik <- tuning_rf$bestTune$mtry

# ntree ditingkatkan ke 300 pada model final untuk stabilitas prediksi
model_rf <- randomForest(
  Room_Occupancy_Count ~ .,
  data       = data_latih,
  ntree      = 300,
  mtry       = mtry_terbaik,
  importance = TRUE
)

print(model_rf)
## 
## Call:
##  randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih,      ntree = 300, mtry = mtry_terbaik, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 0.22%
## Confusion matrix:
##         kelas_0 kelas_1 kelas_2 kelas_3  class.error
## kelas_0    6582       1       0       0 0.0001519064
## kelas_1       0     366       2       0 0.0054347826
## kelas_2       0       0     592       7 0.0116861436
## kelas_3       1       0       7     548 0.0143884892

8.3 Feature Importance

imp_mat <- importance(model_rf)
df_imp  <- data.frame(
  Fitur      = rownames(imp_mat),
  Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
) %>% arrange(desc(Pentingnya))

ggplot(df_imp, aes(x = reorder(Fitur, Pentingnya), y = Pentingnya,
                    fill = Pentingnya)) +
  geom_col(color = "black", width = 0.7) +
  coord_flip() +
  scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
  labs(title    = "Pentingnya Fitur — Random Forest",
       subtitle = "Berdasarkan Mean Decrease Accuracy",
       x = "Variabel Sensor", y = "Mean Decrease Accuracy") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

8.4 Evaluasi Random Forest

pred_rf  <- predict(model_rf, X_uji, type = "class")
hasil_rf <- evaluasi(y_uji, pred_rf, "Random Forest")
## 
## ========== Random Forest ==========
##   Akurasi  : 0.9975  (99.75%)
##   Presisi  : 0.9911
##   Recall   : 0.9903
##   F1-Score : 0.9906
## 
## Confusion Matrix:
##           Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
##    kelas_0    1645       0       0       1
##    kelas_1       0      90       2       0
##    kelas_2       0       1     147       1
##    kelas_3       0       0       0     136

9. Perbandingan Ketiga Model

Tabel Perbandingan

tabel_hasil <- data.frame(
  Model             = c("Decision Tree", "Regresi Logistik", "Random Forest"),
  Hyperparameter    = c(
    paste0("cp = ",    formatC(cp_terbaik,    format = "f", digits = 4)),
    paste0("decay = ", formatC(decay_terbaik, format = "f", digits = 4)),
    paste0("mtry = ",  mtry_terbaik)
  ),
  Akurasi  = round(c(hasil_dt$akurasi, hasil_rl$akurasi, hasil_rf$akurasi), 4),
  Presisi  = round(c(hasil_dt$presisi, hasil_rl$presisi, hasil_rf$presisi), 4),
  Recall   = round(c(hasil_dt$recall,  hasil_rl$recall,  hasil_rf$recall),  4),
  F1_Score = round(c(hasil_dt$f1,      hasil_rl$f1,      hasil_rf$f1),      4)
)

print(tabel_hasil)
##              Model Hyperparameter Akurasi Presisi Recall F1_Score
## 1    Decision Tree    cp = 0.0010  0.9960  0.9848 0.9840   0.9844
## 2 Regresi Logistik decay = 0.0100  0.9960  0.9914 0.9846   0.9878
## 3    Random Forest       mtry = 6  0.9975  0.9911 0.9903   0.9906

Grafik Perbandingan

df_plot <- melt(tabel_hasil[, c("Model","Akurasi","Presisi","Recall","F1_Score")],
                id.vars = "Model", variable.name = "Metrik", value.name = "Nilai")

ggplot(df_plot, aes(x = Model, y = Nilai, fill = Model)) +
  geom_col(color = "black", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f", Nilai)),
            vjust = -0.4, size = 3.8, fontface = "bold") +
  facet_wrap(~Metrik, ncol = 4) +
  scale_fill_manual(values = c(
    "Decision Tree"    = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest"    = "#70AD47"
  )) +
  scale_y_continuous(limits = c(0, 1.15),
                     labels = percent_format(accuracy = 1)) +
  labs(title    = "Perbandingan Performa Ketiga Model",
       subtitle = paste0("Tanpa SMOTE | Hyperparameter Tuning (5-Fold CV) | Seed = ", SEED),
       x = NULL, y = "Nilai Metrik", fill = "Model") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x    = element_blank(),
        axis.ticks.x   = element_blank(),
        strip.text     = element_text(face = "bold"),
        plot.title     = element_text(face = "bold"),
        legend.position = "bottom")

Confusion Matrix Visual — Model Terbaik

terbaik_idx  <- which.max(tabel_hasil$F1_Score)
nama_terbaik <- tabel_hasil$Model[terbaik_idx]

cm_terbaik <- switch(nama_terbaik,
  "Decision Tree"    = hasil_dt$cm,
  "Regresi Logistik" = hasil_rl$cm,
  "Random Forest"    = hasil_rf$cm
)

df_cm <- as.data.frame(cm_terbaik$table)
names(df_cm) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(df_cm, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 1.2) +
  geom_text(aes(label = Frekuensi), color = "black",
            size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
  labs(title    = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
       subtitle = "Baris = Prediksi | Kolom = Nilai Aktual",
       x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Jumlah") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold"))


10. Ringkasan Akhir

terbaik <- tabel_hasil %>% arrange(desc(F1_Score)) %>% slice(1)

cat("============================================================\n")
## ============================================================
cat(sprintf("  SEED                   : %d\n", SEED))
##   SEED                   : 42
cat(sprintf("  Penanganan Imbalance   : Tidak ada (Tanpa SMOTE)\n"))
##   Penanganan Imbalance   : Tidak ada (Tanpa SMOTE)
cat(sprintf("  Evaluasi Tuning        : 5-Fold CV\n"))
##   Evaluasi Tuning        : 5-Fold CV
cat("------------------------------------------------------------\n")
## ------------------------------------------------------------
cat(sprintf("  MODEL TERBAIK          : %s\n",   terbaik$Model))
##   MODEL TERBAIK          : Random Forest
cat(sprintf("  Hyperparameter Terbaik : %s\n",   terbaik$Hyperparameter))
##   Hyperparameter Terbaik : mtry = 6
cat(sprintf("  Akurasi                : %.4f (%.2f%%)\n",
            terbaik$Akurasi, terbaik$Akurasi * 100))
##   Akurasi                : 0.9975 (99.75%)
cat(sprintf("  F1-Score               : %.4f\n", terbaik$F1_Score))
##   F1-Score               : 0.9906
cat("============================================================\n")
## ============================================================