library(rpart) # Decision Tree
library(rpart.plot) # Visualisasi Decision Tree
library(nnet) # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(caret) # CV, tuning, confusion matrix
library(pROC) # AUC-ROC
library(ggplot2) # Visualisasi
library(dplyr) # Manipulasi data
library(reshape2) # Melt untuk plot
library(scales) # Format label
# install.packages(c("rpart","rpart.plot","nnet","randomForest",
# "caret","pROC","ggplot2","dplyr","reshape2","scales"))Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, cross-validation, model berbasis pohon) menghasilkan hasil yang sama setiap kali dijalankan ulang.
## Seed aktif: 42
df <- read.csv("Occupancy_Estimation.csv",
stringsAsFactors = FALSE)
cat("Dataset dimuat:", nrow(df), "baris x", ncol(df), "kolom\n")## Dataset dimuat: 10129 baris x 19 kolom
## Missing values : 0
df_bersih <- df %>% select(-Date, -Time)
# Gunakan prefix "kelas_" agar level menjadi valid R variable name
# (caret membutuhkan level faktor yang valid sebagai nama variabel)
LEVELS <- c("kelas_0", "kelas_1", "kelas_2", "kelas_3")
df_bersih$Room_Occupancy_Count <- factor(
paste0("kelas_", df_bersih$Room_Occupancy_Count),
levels = LEVELS
)
cat("Fitur prediktor :", ncol(df_bersih) - 1, "\n")## Fitur prediktor : 16
## Kelas target : kelas_0 kelas_1 kelas_2 kelas_3
set.seed(SEED)
idx_latih <- createDataPartition(df_bersih$Room_Occupancy_Count,
p = 0.8, list = FALSE)
data_latih <- df_bersih[ idx_latih, ]
data_uji <- df_bersih[-idx_latih, ]
# Variabel uji — digunakan oleh SEMUA model
X_uji <- data_uji %>% select(-Room_Occupancy_Count)
y_uji <- data_uji$Room_Occupancy_Count
cat("Data latih :", nrow(data_latih), "observasi\n")## Data latih : 8106 observasi
## Data uji : 2023 observasi
##
## Distribusi kelas — Data Latih:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi kelas — Data Uji:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 1645 91 149 138
# Visualisasi distribusi kelas data latih (tanpa SMOTE)
df_dist_vis <- data.frame(
Kelas = as.character(data_latih$Room_Occupancy_Count)
)
ggplot(df_dist_vis, aes(x = Kelas, fill = Kelas)) +
geom_bar(color = "black", width = 0.6) +
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.4, size = 3.8, fontface = "bold") +
scale_fill_manual(values = c("kelas_0"="#4472C4","kelas_1"="#ED7D31",
"kelas_2"="#70AD47","kelas_3"="#E74C3C")) +
labs(title = "Distribusi Kelas Data Latih (Tanpa SMOTE)",
subtitle = "Data asli tanpa oversampling",
x = "Kelas Penghuni", y = "Frekuensi", fill = "Kelas") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")Standardisasi hanya diterapkan untuk Regresi Logistik Multinomial karena algoritma ini sensitif terhadap skala fitur. Decision Tree dan Random Forest tidak memerlukan standardisasi.
Parameter mean & sd dihitung dari data latih saja, lalu diterapkan ke data uji untuk menghindari data leakage.
# Fitur kontinu yang akan distandarisasi
# (kecuali variabel biner PIR dan kolom target)
fitur_std <- setdiff(names(data_latih),
c("Room_Occupancy_Count", "S6_PIR", "S7_PIR"))
# Hitung mean & sd DARI DATA LATIH saja
mu_lat <- colMeans(data_latih[, fitur_std])
sd_lat <- apply(data_latih[, fitur_std], 2, sd)
sd_lat[sd_lat == 0] <- 1 # hindari bagi nol
# Terapkan ke data latih → data_latih_std (khusus Regresi Logistik)
data_latih_std <- data_latih
data_latih_std[, fitur_std] <- scale(data_latih[, fitur_std],
center = mu_lat, scale = sd_lat)
# Terapkan ke data uji (pakai mu & sd dari latih, bukan dari uji!)
X_uji_std <- X_uji
X_uji_std[, fitur_std] <- scale(X_uji[, fitur_std],
center = mu_lat, scale = sd_lat)
cat("Standardisasi selesai.\n")## Standardisasi selesai.
## Fitur yang distandarisasi: 14 fitur
## Contoh mean fitur latih (5 pertama):
## S1_Temp S2_Temp S3_Temp S4_Temp S1_Light
## 25.454 25.548 25.058 25.755 25.650
# Fungsi terpusat untuk menghitung & menampilkan semua metrik
# Selalu memastikan level prediksi = level aktual = LEVELS
evaluasi <- function(y_aktual, y_pred, nama_model) {
# Paksa kedua vektor menjadi faktor dengan level yang sama persis
y_aktual <- factor(as.character(y_aktual), levels = LEVELS)
y_pred <- factor(as.character(y_pred), levels = LEVELS)
cm <- confusionMatrix(y_pred, y_aktual)
acc <- as.numeric(cm$overall["Accuracy"])
pre <- mean(cm$byClass[, "Precision"], na.rm = TRUE)
rec <- mean(cm$byClass[, "Recall"], na.rm = TRUE)
f1 <- mean(cm$byClass[, "F1"], na.rm = TRUE)
cat(sprintf("\n========== %s ==========\n", nama_model))
cat(sprintf(" Akurasi : %.4f (%.2f%%)\n", acc, acc * 100))
cat(sprintf(" Presisi : %.4f\n", pre))
cat(sprintf(" Recall : %.4f\n", rec))
cat(sprintf(" F1-Score : %.4f\n", f1))
cat("\nConfusion Matrix:\n")
print(cm$table)
# Kembalikan list metrik untuk tabel perbandingan
list(cm = cm, akurasi = acc, presisi = pre, recall = rec, f1 = f1)
}5-Fold CV digunakan untuk memilih hyperparameter terbaik. Setiap fold memiliki proporsi kelas yang representatif sesuai data asli.
set.seed(SEED)
ctrl_cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary,
savePredictions = "final",
verboseIter = FALSE
)
cat("Cross-validation: 5-Fold CV\n")## Cross-validation: 5-Fold CV
## Metric tuning : Akurasi
cp (complexity parameter): mengontrol ukuran pohon.
Nilai kecil → pohon lebih dalam (risiko overfit),
nilai besar → pohon lebih sederhana (risiko underfit).
set.seed(SEED)
grid_dt <- expand.grid(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05))
tuning_dt <- train(
Room_Occupancy_Count ~ .,
data = data_latih, # data latih tanpa SMOTE
method = "rpart",
trControl = ctrl_cv,
tuneGrid = grid_dt,
metric = "Accuracy"
)
cat("Hasil Tuning Decision Tree:\n")## Hasil Tuning Decision Tree:
## cp Accuracy Kappa
## 1 1e-04 0.9925987 0.9774544
## 2 5e-04 0.9925987 0.9774544
## 3 1e-03 0.9925987 0.9774545
## 4 5e-03 0.9913655 0.9735933
## 5 1e-02 0.9888973 0.9660013
## 6 5e-02 0.9738471 0.9204301
##
## CP terbaik: 0.001
set.seed(SEED)
cp_terbaik <- tuning_dt$bestTune$cp
model_dt <- rpart(
Room_Occupancy_Count ~ .,
data = data_latih,
method = "class",
control = rpart.control(cp = cp_terbaik, maxdepth = 15, minsplit = 10)
)
cat("Decision Tree dilatih dengan cp =", cp_terbaik, "\n")## Decision Tree dilatih dengan cp = 0.001
##
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih,
## method = "class", control = rpart.control(cp = cp_terbaik,
## maxdepth = 15, minsplit = 10))
##
## Variables actually used in tree construction:
## [1] S1_Light S1_Temp S2_Light S2_Sound S2_Temp
## [6] S3_Light S3_Temp S4_Sound S4_Temp S5_CO2
## [11] S5_CO2_Slope S7_PIR
##
## Root node error: 1523/8106 = 0.18789
##
## n= 8106
##
## CP nsplit rel error xerror xstd
## 1 0.3933027 0 1.000000 1.000000 0.0230918
## 2 0.1845043 1 0.606697 0.607354 0.0187958
## 3 0.1260670 2 0.422193 0.422850 0.0159870
## 4 0.1011162 3 0.296126 0.299409 0.0136210
## 5 0.0623769 4 0.195010 0.196980 0.0111602
## 6 0.0374261 5 0.132633 0.143139 0.0095633
## 7 0.0137886 6 0.095207 0.105712 0.0082482
## 8 0.0131320 7 0.081418 0.088641 0.0075652
## 9 0.0105056 8 0.068286 0.073539 0.0069006
## 10 0.0085358 9 0.057781 0.063033 0.0063951
## 11 0.0065660 10 0.049245 0.057124 0.0060914
## 12 0.0039396 12 0.036113 0.049902 0.0056972
## 13 0.0021887 13 0.032173 0.044649 0.0053917
## 14 0.0019698 18 0.020355 0.038083 0.0049826
## 15 0.0013132 19 0.018385 0.037426 0.0049398
## 16 0.0010000 21 0.015758 0.038739 0.0050250
rpart.plot(
model_dt,
type = 4,
extra = 104,
fallen.leaves = TRUE,
main = "Decision Tree - Estimasi Penghuni Ruangan",
box.palette = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000"),
shadow.col = "gray",
cex = 0.7
)pred_dt <- predict(model_dt, X_uji, type = "class")
hasil_dt <- evaluasi(y_uji, pred_dt, "Decision Tree")##
## ========== Decision Tree ==========
## Akurasi : 0.9960 (99.60%)
## Presisi : 0.9848
## Recall : 0.9840
## F1-Score : 0.9844
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 1
## kelas_1 0 89 3 0
## kelas_2 0 1 146 2
## kelas_3 0 1 0 135
Catatan: Regresi Logistik menggunakan
data_latih_std(data yang telah distandarisasi) agar koefisien model tidak bias akibat perbedaan skala fitur. Data uji juga distandarisasi menggunakan parameter yang sama dari data latih.
decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar.
set.seed(SEED)
grid_rl <- expand.grid(decay = c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5))
tuning_rl <- train(
Room_Occupancy_Count ~ .,
data = data_latih_std, # data latih yang sudah distandarisasi
method = "multinom",
trControl = ctrl_cv,
tuneGrid = grid_rl,
maxit = 200,
MaxNWts = 5000,
trace = FALSE,
metric = "Accuracy"
)
cat("Hasil Tuning Regresi Logistik:\n")## Hasil Tuning Regresi Logistik:
## decay Accuracy Kappa
## 1 1e-04 0.9938320 0.9812253
## 2 1e-03 0.9944486 0.9831144
## 3 1e-02 0.9950655 0.9849855
## 4 5e-02 0.9940786 0.9819768
## 5 1e-01 0.9937085 0.9808430
## 6 5e-01 0.9909948 0.9725448
##
## Decay terbaik: 0.01
set.seed(SEED)
decay_terbaik <- tuning_rl$bestTune$decay
suppressMessages(
model_rl <- multinom(
Room_Occupancy_Count ~ .,
data = data_latih_std, # data latih yang sudah distandarisasi
MaxNWts = 5000,
maxit = 200,
trace = FALSE,
decay = decay_terbaik
)
)
cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")## Regresi Logistik dilatih dengan decay = 0.01
pred_rl <- predict(model_rl, X_uji_std, type = "class") # gunakan X_uji_std
hasil_rl <- evaluasi(y_uji, pred_rl, "Regresi Logistik")##
## ========== Regresi Logistik ==========
## Akurasi : 0.9960 (99.60%)
## Presisi : 0.9914
## Recall : 0.9846
## F1-Score : 0.9878
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 3
## kelas_1 0 90 0 0
## kelas_2 0 1 149 4
## kelas_3 0 0 0 131
mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Nilai kecil → pohon lebih beragam,
nilai besar → tiap pohon lebih kuat tapi lebih berkorelasi satu sama lain.
set.seed(SEED)
grid_rf <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))
tuning_rf <- train(
Room_Occupancy_Count ~ .,
data = data_latih, # data latih tanpa SMOTE
method = "rf",
trControl = ctrl_cv,
tuneGrid = grid_rf,
ntree = 100,
metric = "Accuracy"
)
cat("Hasil Tuning Random Forest:\n")## Hasil Tuning Random Forest:
## mtry Accuracy Kappa
## 1 2 0.9974095 0.9921062
## 2 4 0.9975328 0.9924841
## 3 6 0.9976561 0.9928543
## 4 8 0.9974093 0.9921018
## 5 10 0.9975326 0.9924772
## 6 12 0.9975325 0.9924802
##
## mtry terbaik: 6
set.seed(SEED)
mtry_terbaik <- tuning_rf$bestTune$mtry
# ntree ditingkatkan ke 300 pada model final untuk stabilitas prediksi
model_rf <- randomForest(
Room_Occupancy_Count ~ .,
data = data_latih,
ntree = 300,
mtry = mtry_terbaik,
importance = TRUE
)
print(model_rf)##
## Call:
## randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih, ntree = 300, mtry = mtry_terbaik, importance = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 0.22%
## Confusion matrix:
## kelas_0 kelas_1 kelas_2 kelas_3 class.error
## kelas_0 6582 1 0 0 0.0001519064
## kelas_1 0 366 2 0 0.0054347826
## kelas_2 0 0 592 7 0.0116861436
## kelas_3 1 0 7 548 0.0143884892
imp_mat <- importance(model_rf)
df_imp <- data.frame(
Fitur = rownames(imp_mat),
Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
) %>% arrange(desc(Pentingnya))
ggplot(df_imp, aes(x = reorder(Fitur, Pentingnya), y = Pentingnya,
fill = Pentingnya)) +
geom_col(color = "black", width = 0.7) +
coord_flip() +
scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
labs(title = "Pentingnya Fitur — Random Forest",
subtitle = "Berdasarkan Mean Decrease Accuracy",
x = "Variabel Sensor", y = "Mean Decrease Accuracy") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")pred_rf <- predict(model_rf, X_uji, type = "class")
hasil_rf <- evaluasi(y_uji, pred_rf, "Random Forest")##
## ========== Random Forest ==========
## Akurasi : 0.9975 (99.75%)
## Presisi : 0.9911
## Recall : 0.9903
## F1-Score : 0.9906
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 1
## kelas_1 0 90 2 0
## kelas_2 0 1 147 1
## kelas_3 0 0 0 136
tabel_hasil <- data.frame(
Model = c("Decision Tree", "Regresi Logistik", "Random Forest"),
Hyperparameter = c(
paste0("cp = ", formatC(cp_terbaik, format = "f", digits = 4)),
paste0("decay = ", formatC(decay_terbaik, format = "f", digits = 4)),
paste0("mtry = ", mtry_terbaik)
),
Akurasi = round(c(hasil_dt$akurasi, hasil_rl$akurasi, hasil_rf$akurasi), 4),
Presisi = round(c(hasil_dt$presisi, hasil_rl$presisi, hasil_rf$presisi), 4),
Recall = round(c(hasil_dt$recall, hasil_rl$recall, hasil_rf$recall), 4),
F1_Score = round(c(hasil_dt$f1, hasil_rl$f1, hasil_rf$f1), 4)
)
print(tabel_hasil)## Model Hyperparameter Akurasi Presisi Recall F1_Score
## 1 Decision Tree cp = 0.0010 0.9960 0.9848 0.9840 0.9844
## 2 Regresi Logistik decay = 0.0100 0.9960 0.9914 0.9846 0.9878
## 3 Random Forest mtry = 6 0.9975 0.9911 0.9903 0.9906
df_plot <- melt(tabel_hasil[, c("Model","Akurasi","Presisi","Recall","F1_Score")],
id.vars = "Model", variable.name = "Metrik", value.name = "Nilai")
ggplot(df_plot, aes(x = Model, y = Nilai, fill = Model)) +
geom_col(color = "black", width = 0.6) +
geom_text(aes(label = sprintf("%.3f", Nilai)),
vjust = -0.4, size = 3.8, fontface = "bold") +
facet_wrap(~Metrik, ncol = 4) +
scale_fill_manual(values = c(
"Decision Tree" = "#4472C4",
"Regresi Logistik" = "#ED7D31",
"Random Forest" = "#70AD47"
)) +
scale_y_continuous(limits = c(0, 1.15),
labels = percent_format(accuracy = 1)) +
labs(title = "Perbandingan Performa Ketiga Model",
subtitle = paste0("Tanpa SMOTE | Hyperparameter Tuning (5-Fold CV) | Seed = ", SEED),
x = NULL, y = "Nilai Metrik", fill = "Model") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold"),
legend.position = "bottom")terbaik_idx <- which.max(tabel_hasil$F1_Score)
nama_terbaik <- tabel_hasil$Model[terbaik_idx]
cm_terbaik <- switch(nama_terbaik,
"Decision Tree" = hasil_dt$cm,
"Regresi Logistik" = hasil_rl$cm,
"Random Forest" = hasil_rf$cm
)
df_cm <- as.data.frame(cm_terbaik$table)
names(df_cm) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(df_cm, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = Frekuensi), color = "black",
size = 6, fontface = "bold") +
scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
labs(title = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
subtitle = "Baris = Prediksi | Kolom = Nilai Aktual",
x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Jumlah") +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold"))terbaik <- tabel_hasil %>% arrange(desc(F1_Score)) %>% slice(1)
cat("============================================================\n")## ============================================================
## SEED : 42
## Penanganan Imbalance : Tidak ada (Tanpa SMOTE)
## Evaluasi Tuning : 5-Fold CV
## ------------------------------------------------------------
## MODEL TERBAIK : Random Forest
## Hyperparameter Terbaik : mtry = 6
## Akurasi : 0.9975 (99.75%)
## F1-Score : 0.9906
## ============================================================