library(rpart) # Decision Tree
library(rpart.plot) # Visualisasi Decision Tree
library(nnet) # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(smotefamily) # SMOTE
library(caret) # CV, tuning, confusion matrix
library(pROC) # AUC-ROC
library(ggplot2) # Visualisasi
library(dplyr) # Manipulasi data
library(reshape2) # Melt untuk plot
library(scales) # Format label
# Hapus:
library(smotefamily)
# Tambah:
library(themis)
library(recipes)
# install.packages(c("rpart","rpart.plot","nnet","randomForest",
# "smotefamily","caret","pROC",
# "ggplot2","dplyr","reshape2","scales"))
Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, SMOTE, cross-validation, model berbasis pohon) menghasilkan hasil yang sama setiap kali dijalankan ulang.
## Seed aktif: 42
# Load the raw sensor readings; text columns are kept as plain strings.
df <- read.csv(
  file = "Occupancy_Estimation.csv",
  stringsAsFactors = FALSE
)
cat("Dataset dimuat:", nrow(df), "baris x", ncol(df), "kolom\n")
## Dataset dimuat: 10129 baris x 19 kolom

# Drop the Date and Time columns so they are not used as predictors.
df_bersih <- select(df, -Date, -Time)
# Class labels get a "kelas_" prefix so that every factor level is a valid
# R variable name (per the original author's note, caret requires this).
LEVELS <- paste0("kelas_", 0:3)

# Recode the target as a factor with exactly the four levels above.
df_bersih$Room_Occupancy_Count <- factor(
  x = paste0("kelas_", df_bersih$Room_Occupancy_Count),
  levels = LEVELS
)

# Every column except the target is a predictor.
cat("Fitur prediktor :", ncol(df_bersih) - 1, "\n")
## Fitur prediktor : 16
## Kelas target : kelas_0 kelas_1 kelas_2 kelas_3
# SEED is used throughout this script, but no assignment to it is visible in
# this file; only its echoed value ("Seed aktif: 42") survives. Fall back to
# that value when SEED has not been defined yet, and leave any existing
# definition untouched.
if (!exists("SEED")) {
  SEED <- 42
}

# 80/20 train/test split drawn by caret::createDataPartition on the target.
set.seed(SEED)
idx_latih <- createDataPartition(
  df_bersih$Room_Occupancy_Count,
  p = 0.8,
  list = FALSE
)
data_latih <- df_bersih[idx_latih, ]
data_uji <- df_bersih[-idx_latih, ]

# Test-set predictors and labels, used by ALL models below.
X_uji <- data_uji %>% select(-Room_Occupancy_Count)
# Factor with levels "kelas_0", "kelas_1", "kelas_2", "kelas_3".
y_uji <- data_uji$Room_Occupancy_Count

cat("Data latih :", nrow(data_latih), "observasi\n")
## Data latih : 8106 observasi
## Data uji : 2023 observasi
##
## Distribusi kelas — Data Latih:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi kelas — Data Uji:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 1645 91 149 138
Mengapa SMOTE?
Kelas 0 (kosong) mencakup ~81% data. Tanpa penanganan, model cenderung mengabaikan kelas minoritas.
SMOTE membangkitkan sampel sintetis untuk kelas 1, 2, dan 3 melalui interpolasi di ruang fitur — hanya diterapkan pada data latih agar data uji tetap mencerminkan kondisi nyata.
# Oversample the training set with SMOTE (themis::step_smote): over_ratio = 1
# and 5 nearest neighbours, with the step's own seed set to SEED.
set.seed(SEED)
rec_smote <- step_smote(
  recipe(Room_Occupancy_Count ~ ., data = data_latih),
  Room_Occupancy_Count,
  over_ratio = 1,
  neighbors = 5,
  seed = SEED
)

# Estimate the recipe and extract the processed training data.
data_latih_s <- bake(prep(rec_smote), new_data = NULL)

# Re-apply the canonical level order so it stays consistent with LEVELS.
data_latih_s$Room_Occupancy_Count <- factor(
  data_latih_s$Room_Occupancy_Count,
  levels = LEVELS
)

cat("Distribusi sebelum SMOTE:\n")
## Distribusi sebelum SMOTE:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi setelah SMOTE:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 6583 6583 6583
# Long table with one row per training observation, tagged by whether it
# comes from the original or the SMOTE-augmented training set.
df_smote_vis <- data.frame(
  Kondisi = factor(
    rep(
      c("Sebelum SMOTE", "Setelah SMOTE"),
      times = c(nrow(data_latih), nrow(data_latih_s))
    ),
    levels = c("Sebelum SMOTE", "Setelah SMOTE")
  ),
  Kelas = c(
    as.character(data_latih$Room_Occupancy_Count),
    as.character(data_latih_s$Room_Occupancy_Count)
  )
)

# Bar chart of class counts, one facet per condition, with count labels.
ggplot(df_smote_vis, aes(x = Kelas, fill = Kelas)) +
  geom_bar(color = "black", width = 0.6) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.4,
    size = 3.8,
    fontface = "bold"
  ) +
  facet_wrap(~Kondisi) +
  scale_fill_manual(
    values = c(
      "kelas_0" = "#4472C4",
      "kelas_1" = "#ED7D31",
      "kelas_2" = "#70AD47",
      "kelas_3" = "#E74C3C"
    )
  ) +
  labs(
    title = "Distribusi Kelas Sebelum dan Setelah SMOTE",
    subtitle = "SMOTE hanya diterapkan pada data latih",
    x = "Kelas Penghuni",
    y = "Frekuensi",
    fill = "Kelas"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )
# Continuous features that will be standardised
# Columns to standardise: everything except the target and the two PIR
# columns (S6_PIR, S7_PIR).
kolom_dikecualikan <- c("Room_Occupancy_Count", "S6_PIR", "S7_PIR")
fitur_std <- setdiff(names(data_latih_s), kolom_dikecualikan)

# Centre and spread are estimated from the (SMOTE-augmented) training data
# ONLY.
mu_lat <- colMeans(data_latih_s[, fitur_std])
sd_lat <- vapply(data_latih_s[, fitur_std], sd, numeric(1))
sd_lat[sd_lat == 0] <- 1 # avoid division by zero for constant columns

# Standardise the training data.
data_latih_std <- data_latih_s
data_latih_std[, fitur_std] <- scale(
  data_latih_s[, fitur_std],
  center = mu_lat,
  scale = sd_lat
)

# Standardise the test data with the TRAINING mean and sd, never its own.
X_uji_std <- X_uji
X_uji_std[, fitur_std] <- scale(
  X_uji[, fitur_std],
  center = mu_lat,
  scale = sd_lat
)

cat("Standardisasi selesai.\n")
## Standardisasi selesai.
## Contoh mean fitur latih (5 pertama):
## S1_Temp S2_Temp S3_Temp S4_Temp S1_Light
## 25.788 26.039 25.415 26.031 93.695
# Central helper that computes and prints every evaluation metric.
# Both label vectors are always forced onto the same factor levels (LEVELS).
#
# Args:
#   y_aktual   : true class labels (factor or character).
#   y_pred     : predicted class labels (factor or character), same length
#                as y_aktual.
#   nama_model : model name shown in the printed header.
#
# Returns:
#   A list with the caret confusionMatrix object (`cm`) and the scalars
#   `akurasi`, `presisi`, `recall` and `f1`. The last three are unweighted
#   means of the per-class values, skipping classes whose value is NA.
evaluasi <- function(y_aktual, y_pred, nama_model) {
  if (length(y_aktual) != length(y_pred)) {
    stop("y_aktual dan y_pred harus memiliki panjang yang sama", call. = FALSE)
  }

  aktual_chr <- as.character(y_aktual)
  pred_chr <- as.character(y_pred)

  # Any label that is not in LEVELS becomes NA in factor() below, so it would
  # silently drop out of the metrics. Surface that instead of hiding it.
  label_asing <- setdiff(c(aktual_chr, pred_chr), c(LEVELS, NA))
  if (length(label_asing) > 0) {
    warning(
      "Label di luar LEVELS diperlakukan sebagai NA: ",
      paste(label_asing, collapse = ", "),
      call. = FALSE
    )
  }

  # Force both vectors onto exactly the same factor levels.
  y_aktual <- factor(aktual_chr, levels = LEVELS)
  y_pred <- factor(pred_chr, levels = LEVELS)

  cm <- confusionMatrix(y_pred, y_aktual)
  acc <- as.numeric(cm$overall["Accuracy"])
  pre <- mean(cm$byClass[, "Precision"], na.rm = TRUE)
  rec <- mean(cm$byClass[, "Recall"], na.rm = TRUE)
  f1 <- mean(cm$byClass[, "F1"], na.rm = TRUE)

  cat(sprintf("\n========== %s ==========\n", nama_model))
  cat(sprintf(" Akurasi : %.4f (%.2f%%)\n", acc, acc * 100))
  cat(sprintf(" Presisi : %.4f\n", pre))
  cat(sprintf(" Recall : %.4f\n", rec))
  cat(sprintf(" F1-Score : %.4f\n", f1))
  cat("\nConfusion Matrix:\n")
  print(cm$table)

  # Return the metrics as a list for the comparison table.
  list(cm = cm, akurasi = acc, presisi = pre, recall = rec, f1 = f1)
}
5-Fold Stratified CV digunakan untuk memilih hyperparameter terbaik. Stratified memastikan setiap fold memiliki proporsi kelas yang representatif.
# Resampling configuration shared by every train() call that passes
# trControl = ctrl_cv.
set.seed(SEED)
ctrl_cv <- trainControl(
  method = "cv", # k-fold cross-validation
  number = 5, # k = 5
  summaryFunction = multiClassSummary,
  classProbs = TRUE,
  savePredictions = "final",
  verboseIter = FALSE
)
cat("Cross-validation: 5-Fold Stratified\n")
## Cross-validation: 5-Fold Stratified
## Metric tuning : Akurasi
cp (complexity parameter): mengontrol ukuran pohon.
Nilai kecil → pohon lebih dalam (risiko overfit),
nilai besar → pohon lebih sederhana (risiko underfit).
# Grid of candidate complexity parameters, from 0.0001 up to 0.05.
set.seed(SEED)
grid_dt <- expand.grid(cp = c(1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2))

# Tune the rpart tree with the shared CV setup, selecting on Accuracy.
# NOTE(review): data_latih_s was SMOTE-augmented before the folds are drawn,
# so held-out folds can contain synthetic rows interpolated from training-fold
# rows; the CV accuracy is probably optimistic. Confirm whether resampling
# should happen inside each fold instead.
tuning_dt <- train(
  Room_Occupancy_Count ~ .,
  data = data_latih_s, # post-SMOTE data, levels "kelas_0"–"kelas_3"
  method = "rpart",
  metric = "Accuracy",
  tuneGrid = grid_dt,
  trControl = ctrl_cv
)
cat("Hasil Tuning Decision Tree:\n")
## Hasil Tuning Decision Tree:
## cp Accuracy Kappa
## 1 1e-04 0.9958985 0.9945313
## 2 5e-04 0.9933539 0.9911385
## 3 1e-03 0.9879231 0.9838975
## 4 5e-03 0.9761124 0.9681499
## 5 1e-02 0.9544280 0.9392373
## 6 5e-02 0.8867921 0.8490561
##
## CP terbaik: 1e-04
# Refit the final tree with the cp value selected by the tuning step.
cp_terbaik <- tuning_dt$bestTune$cp

# NOTE(review): maxdepth and minsplit are set here but were not passed to the
# train() call that selected cp; confirm that this is intentional.
set.seed(SEED)
model_dt <- rpart(
  Room_Occupancy_Count ~ .,
  data = data_latih_s,
  method = "class",
  control = rpart.control(cp = cp_terbaik, maxdepth = 15, minsplit = 10)
)

cat("Decision Tree dilatih dengan cp =", cp_terbaik, "\n")
## Decision Tree dilatih dengan cp = 1e-04
##
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih_s,
## method = "class", control = rpart.control(cp = cp_terbaik,
## maxdepth = 15, minsplit = 10))
##
## Variables actually used in tree construction:
## [1] S1_Light S1_Sound S1_Temp S2_Light S2_Sound
## [6] S2_Temp S3_Light S3_Sound S3_Temp S4_Light
## [11] S4_Sound S5_CO2 S5_CO2_Slope S7_PIR
##
## Root node error: 19749/26332 = 0.75
##
## n= 26332
##
## CP nsplit rel error xerror xstd
## 1 0.31368677 0 1.0000000 1.0088612 0.00352584
## 2 0.24654413 1 0.6863132 0.6863132 0.00410656
## 3 0.12679123 2 0.4397691 0.4403261 0.00386432
## 4 0.08344726 3 0.3129779 0.3145476 0.00348853
## 5 0.07909261 4 0.2295306 0.2311003 0.00311025
## 6 0.03772343 5 0.1504380 0.1511469 0.00260496
## 7 0.02729252 6 0.1127146 0.1133222 0.00229138
## 8 0.01316522 7 0.0854220 0.0859790 0.00201813
## 9 0.01184870 8 0.0722568 0.0714467 0.00185037
## 10 0.00911439 9 0.0604081 0.0606613 0.00171227
## 11 0.00886121 10 0.0512937 0.0502810 0.00156525
## 12 0.00673452 11 0.0424325 0.0433440 0.00145719
## 13 0.00546863 12 0.0356980 0.0368626 0.00134720
## 14 0.00379766 13 0.0302294 0.0316978 0.00125175
## 15 0.00244738 14 0.0264317 0.0261279 0.00113889
## 16 0.00212669 17 0.0190896 0.0218239 0.00104258
## 17 0.00136716 18 0.0169629 0.0193428 0.00098246
## 18 0.00101271 20 0.0142286 0.0160514 0.00089610
## 19 0.00086080 22 0.0122031 0.0150894 0.00086914
## 20 0.00081017 23 0.0113423 0.0138235 0.00083229
## 21 0.00073421 24 0.0105322 0.0135197 0.00082319
## 22 0.00064138 26 0.0090638 0.0129120 0.00080466
## 23 0.00055699 29 0.0071396 0.0107347 0.00073429
## 24 0.00050635 30 0.0065826 0.0102284 0.00071690
## 25 0.00040508 31 0.0060763 0.0094182 0.00068813
## 26 0.00035445 32 0.0056712 0.0074434 0.00061221
## 27 0.00024305 35 0.0046078 0.0062788 0.00056252
## 28 0.00020254 40 0.0033926 0.0049623 0.00050033
## 29 0.00015191 46 0.0021773 0.0042534 0.00046334
## 30 0.00010127 48 0.0018735 0.0038989 0.00044367
## 31 0.00010000 49 0.0017722 0.0038483 0.00044079
# Plot the fitted decision tree.
rpart.plot(
  model_dt,
  type = 4,
  extra = 104,
  fallen.leaves = TRUE,
  main = "Decision Tree - Estimasi Penghuni Ruangan",
  box.palette = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000"),
  shadow.col = "gray",
  cex = 0.7
)

# Predict classes for the (unscaled) test predictors. This statement used to
# be fused onto the closing parenthesis of rpart.plot(), which does not parse.
pred_dt <- predict(model_dt, X_uji, type = "class")

# Author's note: pred_dt is already a factor with the LEVELS levels because
# data_latih_s uses the same levels.
hasil_dt <- evaluasi(y_uji, pred_dt, "Decision Tree")
##
## ========== Decision Tree ==========
## Akurasi : 0.9931 (99.31%)
## Presisi : 0.9748
## Recall : 0.9743
## F1-Score : 0.9745
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1644 0 0 2
## kelas_1 0 88 4 0
## kelas_2 0 2 143 2
## kelas_3 1 1 2 134
decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar.
# Grid of candidate weight-decay values, from 0.0001 up to 0.5.
set.seed(SEED)
grid_rl <- expand.grid(decay = c(1e-4, 1e-3, 1e-2, 5e-2, 1e-1, 5e-1))

# Tune the multinomial model on the standardised training data with the
# shared CV setup, selecting on Accuracy. maxit, MaxNWts and trace are
# supplied as extra arguments to train().
tuning_rl <- train(
  Room_Occupancy_Count ~ .,
  data = data_latih_std, # standardised data, levels "kelas_0"–"kelas_3"
  method = "multinom",
  metric = "Accuracy",
  tuneGrid = grid_rl,
  trControl = ctrl_cv,
  MaxNWts = 5000,
  maxit = 200,
  trace = FALSE
)
cat("Hasil Tuning Regresi Logistik:\n")
## Hasil Tuning Regresi Logistik:
## decay Accuracy Kappa
## 1 1e-04 0.9899362 0.9865815
## 2 1e-03 0.9896324 0.9861765
## 3 1e-02 0.9885309 0.9847079
## 4 5e-02 0.9880752 0.9841003
## 5 1e-01 0.9876954 0.9835939
## 6 5e-01 0.9864422 0.9819229
##
## Decay terbaik: 1e-04
# Refit the final multinomial model with the decay selected by the tuning
# step; messages emitted during fitting are suppressed.
decay_terbaik <- tuning_rl$bestTune$decay
set.seed(SEED)
model_rl <- suppressMessages(
  multinom(
    Room_Occupancy_Count ~ .,
    data = data_latih_std,
    MaxNWts = 5000,
    maxit = 200,
    trace = FALSE,
    decay = decay_terbaik
  )
)
cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")
## Regresi Logistik dilatih dengan decay = 1e-04

# Predict on the standardised test predictors, matching the scaling the
# model was trained on.
pred_rl <- predict(model_rl, X_uji_std, type = "class")

# Author's note: predict() for multinom returns factor/character labels
# "kelas_0"–"kelas_3"; evaluasi() forces them onto the LEVELS levels.
hasil_rl <- evaluasi(y_uji, pred_rl, "Regresi Logistik")
##
## ========== Regresi Logistik ==========
## Akurasi : 0.9960 (99.60%)
## Presisi : 0.9900
## Recall : 0.9827
## F1-Score : 0.9861
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 2
## kelas_1 0 88 0 0
## kelas_2 0 3 149 3
## kelas_3 0 0 0 133
mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Nilai kecil → pohon lebih beragam,
nilai besar → tiap pohon lebih kuat tapi lebih berkorelasi satu sama lain.
# Grid of candidate mtry values: the even numbers from 2 to 12.
set.seed(SEED)
grid_rf <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))

# Tune the random forest with the shared CV setup, selecting on Accuracy.
# ntree = 100 is supplied as an extra argument to train().
tuning_rf <- train(
  Room_Occupancy_Count ~ .,
  data = data_latih_s, # post-SMOTE data, levels "kelas_0"–"kelas_3"
  method = "rf",
  metric = "Accuracy",
  tuneGrid = grid_rf,
  trControl = ctrl_cv,
  ntree = 100
)
cat("Hasil Tuning Random Forest:\n")
## Hasil Tuning Random Forest:
## mtry Accuracy Kappa
## 1 2 0.9994684 0.9992912
## 2 4 0.9994684 0.9992911
## 3 6 0.9995063 0.9993418
## 4 8 0.9995063 0.9993418
## 5 10 0.9992785 0.9990380
## 6 12 0.9992025 0.9989367
##
## mtry terbaik: 6
# Refit the final forest with the mtry selected by the tuning step.
# It uses 300 trees, whereas the tuning run used ntree = 100.
mtry_terbaik <- tuning_rf$bestTune$mtry
set.seed(SEED)
model_rf <- randomForest(
  Room_Occupancy_Count ~ .,
  data = data_latih_s,
  ntree = 300,
  mtry = mtry_terbaik,
  importance = TRUE # required for MeanDecreaseAccuracy later on
)
print(model_rf)
##
## Call:
## randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih_s, ntree = 300, mtry = mtry_terbaik, importance = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 0.05%
## Confusion matrix:
## kelas_0 kelas_1 kelas_2 kelas_3 class.error
## kelas_0 6582 1 0 0 0.0001519064
## kelas_1 0 6582 1 0 0.0001519064
## kelas_2 0 0 6580 3 0.0004557193
## kelas_3 1 0 6 6576 0.0010633450
# Variable-importance table, sorted by MeanDecreaseAccuracy (descending).
imp_mat <- importance(model_rf)
df_imp <- data.frame(
  Fitur = rownames(imp_mat),
  Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
) %>%
  arrange(desc(Pentingnya))

# Horizontal bar chart of feature importance.
ggplot(
  df_imp,
  aes(x = reorder(Fitur, Pentingnya), y = Pentingnya, fill = Pentingnya)
) +
  geom_col(color = "black", width = 0.7) +
  coord_flip() +
  scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
  labs(
    title = "Pentingnya Fitur — Random Forest",
    subtitle = "Berdasarkan Mean Decrease Accuracy",
    x = "Variabel Sensor",
    y = "Mean Decrease Accuracy"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )

# Predict classes for the (unscaled) test predictors. This statement used to
# be fused onto the end of the ggplot chain, which does not parse.
pred_rf <- predict(model_rf, X_uji, type = "class")

# Author's note: predict() for randomForest returns a factor with the
# training-data levels ("kelas_0"–"kelas_3").
hasil_rf <- evaluasi(y_uji, pred_rf, "Random Forest")
##
## ========== Random Forest ==========
## Akurasi : 0.9975 (99.75%)
## Presisi : 0.9909
## Recall : 0.9904
## F1-Score : 0.9906
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 1
## kelas_1 0 90 2 0
## kelas_2 0 1 146 0
## kelas_3 0 0 1 137
# Collect the three evaluation results so each metric can be pulled out in
# the same model order.
semua_hasil <- list(hasil_dt, hasil_rl, hasil_rf)

# Extract one named metric from every result, rounded to 4 decimals.
ambil_metrik <- function(nama) {
  round(vapply(semua_hasil, function(h) h[[nama]], numeric(1)), 4)
}

# Comparison table: one row per model with its tuned hyperparameter.
tabel_hasil <- data.frame(
  Model = c("Decision Tree", "Regresi Logistik", "Random Forest"),
  Hyperparameter = c(
    paste0("cp = ", cp_terbaik),
    paste0("decay = ", decay_terbaik),
    paste0("mtry = ", mtry_terbaik)
  ),
  Akurasi = ambil_metrik("akurasi"),
  Presisi = ambil_metrik("presisi"),
  Recall = ambil_metrik("recall"),
  F1_Score = ambil_metrik("f1")
)
print(tabel_hasil)
## Model Hyperparameter Akurasi Presisi Recall F1_Score
## 1 Decision Tree cp = 1e-04 0.9931 0.9748 0.9743 0.9745
## 2 Regresi Logistik decay = 1e-04 0.9960 0.9900 0.9827 0.9861
## 3 Random Forest mtry = 6 0.9975 0.9909 0.9904 0.9906
# Reshape the metric columns to long form: one row per (model, metric).
df_plot <- melt(
  tabel_hasil[, c("Model", "Akurasi", "Presisi", "Recall", "F1_Score")],
  id.vars = "Model",
  variable.name = "Metrik",
  value.name = "Nilai"
)

# Bar chart comparing the three models, one facet per metric.
ggplot(df_plot, aes(x = Model, y = Nilai, fill = Model)) +
  geom_col(color = "black", width = 0.6) +
  geom_text(
    aes(label = sprintf("%.3f", Nilai)),
    vjust = -0.4,
    size = 3.8,
    fontface = "bold"
  ) +
  facet_wrap(~Metrik, ncol = 4) +
  scale_fill_manual(values = c(
    "Decision Tree" = "#4472C4",
    "Regresi Logistik" = "#ED7D31",
    "Random Forest" = "#70AD47"
  )) +
  # The upper limit is above 1 so the value labels over the bars fit.
  scale_y_continuous(
    limits = c(0, 1.15),
    labels = percent_format(accuracy = 1)
  ) +
  labs(
    title = "Perbandingan Performa Ketiga Model",
    subtitle = paste0(
      "SMOTE + Hyperparameter Tuning (5-Fold CV) | Seed = ", SEED
    ),
    x = NULL,
    y = "Nilai Metrik",
    fill = "Model"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

# Row index of the model with the highest F1 score (first one on ties).
# This statement used to be fused onto the end of the ggplot chain, which
# does not parse.
terbaik_idx <- which.max(tabel_hasil$F1_Score)
# Name of the best model. as.character() keeps switch() working even if the
# Model column was created as a factor.
nama_terbaik <- as.character(tabel_hasil$Model[terbaik_idx])

# Pick the confusionMatrix object that belongs to the best model.
cm_terbaik <- switch(nama_terbaik,
  "Decision Tree" = hasil_dt$cm,
  "Regresi Logistik" = hasil_rl$cm,
  "Random Forest" = hasil_rf$cm
)

# Long-form confusion table: one row per (predicted, actual) pair.
df_cm <- as.data.frame(cm_terbaik$table)
names(df_cm) <- c("Prediksi", "Aktual", "Frekuensi")

# Heatmap of the confusion matrix with the counts printed in each tile.
ggplot(df_cm, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 1.2) +
  geom_text(
    aes(label = Frekuensi),
    color = "black",
    size = 6,
    fontface = "bold"
  ) +
  scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
  labs(
    title = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
    subtitle = "Baris = Prediksi | Kolom = Nilai Aktual",
    x = "Kelas Aktual",
    y = "Kelas Prediksi",
    fill = "Jumlah"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold"))

# Single-row table for the best model by F1 score. This statement used to be
# fused onto the end of the ggplot chain, which does not parse.
terbaik <- tabel_hasil %>%
  arrange(desc(F1_Score)) %>%
  slice(1)

cat("============================================================\n")
## ============================================================
## SEED : 42
## Penanganan Imbalance : SMOTE (K=5)
## Evaluasi Tuning : 5-Fold Stratified CV
## ------------------------------------------------------------
## MODEL TERBAIK : Random Forest
## Hyperparameter Terbaik : mtry = 6
## Akurasi : 0.9975 (99.75%)
## F1-Score : 0.9906
## ============================================================