library(rpart) # Decision Tree
library(rpart.plot) # Visualisasi Decision Tree
library(nnet) # Regresi Logistik Multinomial
library(randomForest) # Random Forest
library(smotefamily) # SMOTE
library(caret) # CV, tuning, confusion matrix
library(pROC) # AUC-ROC
library(ggplot2) # Visualisasi
library(dplyr) # Manipulasi data
library(reshape2) # Melt untuk plot
library(scales) # Format label
# install.packages(c("rpart","rpart.plot","nnet","randomForest",
# "smotefamily","caret","pROC",
# "ggplot2","dplyr","reshape2","scales"))Seed digunakan agar seluruh proses yang melibatkan keacakan (split data, SMOTE, cross-validation, model berbasis pohon) menghasilkan hasil yang sama setiap kali dijalankan ulang.
## Seed aktif: 42
df <- read.csv("Occupancy_Estimation.csv",
stringsAsFactors = FALSE)
cat("Dataset dimuat:", nrow(df), "baris x", ncol(df), "kolom\n")## Dataset dimuat: 10129 baris x 19 kolom
df_bersih <- df %>% select(-Date, -Time)
# Gunakan prefix "kelas_" agar level menjadi valid R variable name
# (caret membutuhkan level faktor yang valid sebagai nama variabel)
LEVELS <- c("kelas_0", "kelas_1", "kelas_2", "kelas_3")
df_bersih$Room_Occupancy_Count <- factor(
paste0("kelas_", df_bersih$Room_Occupancy_Count),
levels = LEVELS
)
cat("Fitur prediktor :", ncol(df_bersih) - 1, "\n")## Fitur prediktor : 16
## Kelas target : kelas_0 kelas_1 kelas_2 kelas_3
set.seed(SEED)
idx_latih <- createDataPartition(df_bersih$Room_Occupancy_Count,
p = 0.8, list = FALSE)
data_latih <- df_bersih[ idx_latih, ]
data_uji <- df_bersih[-idx_latih, ]
# Variabel uji — digunakan oleh SEMUA model
X_uji <- data_uji %>% select(-Room_Occupancy_Count)
y_uji <- data_uji$Room_Occupancy_Count # faktor dengan level "kelas_0","kelas_1","kelas_2","kelas_3"
cat("Data latih :", nrow(data_latih), "observasi\n")## Data latih : 8106 observasi
## Data uji : 2023 observasi
##
## Distribusi kelas — Data Latih:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi kelas — Data Uji:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 1645 91 149 138
Mengapa SMOTE?
Kelas 0 (kosong) mencakup ~81% data. Tanpa penanganan, model cenderung mengabaikan kelas minoritas.
SMOTE membangkitkan sampel sintetis untuk kelas 1, 2, dan 3 melalui interpolasi di ruang fitur — hanya diterapkan pada data latih agar data uji tetap mencerminkan kondisi nyata.
set.seed(SEED)
X_latih <- as.data.frame(
lapply(data_latih[, setdiff(names(data_latih), "Room_Occupancy_Count")],
as.numeric)
)
y_latih <- as.character(data_latih$Room_Occupancy_Count)
smote_res <- smotefamily::SMOTE(X_latih, y_latih, K = 5, dup_size = 0)
data_latih_s <- smote_res$data
names(data_latih_s)[ncol(data_latih_s)] <- "Room_Occupancy_Count"
# Pastikan level faktor konsisten dengan LEVELS (kelas_0, kelas_1, kelas_2, kelas_3)
data_latih_s$Room_Occupancy_Count <- factor(
data_latih_s$Room_Occupancy_Count,
levels = LEVELS
)
cat("Distribusi sebelum SMOTE:\n")## Distribusi sebelum SMOTE:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 368 599 556
##
## Distribusi setelah SMOTE:
##
## kelas_0 kelas_1 kelas_2 kelas_3
## 6583 7728 599 556
df_smote_vis <- rbind(
data.frame(Kondisi = "Sebelum SMOTE",
Kelas = as.character(data_latih$Room_Occupancy_Count)),
data.frame(Kondisi = "Setelah SMOTE",
Kelas = as.character(data_latih_s$Room_Occupancy_Count))
)
df_smote_vis$Kondisi <- factor(df_smote_vis$Kondisi,
levels = c("Sebelum SMOTE","Setelah SMOTE"))
ggplot(df_smote_vis, aes(x = Kelas, fill = Kelas)) +
geom_bar(color = "black", width = 0.6) +
geom_text(stat = "count", aes(label = after_stat(count)),
vjust = -0.4, size = 3.8, fontface = "bold") +
facet_wrap(~Kondisi) +
scale_fill_manual(values = c("kelas_0"="#4472C4","kelas_1"="#ED7D31","kelas_2"="#70AD47","kelas_3"="#E74C3C")) +
labs(title = "Distribusi Kelas Sebelum dan Setelah SMOTE",
subtitle = "SMOTE hanya diterapkan pada data latih",
x = "Kelas Penghuni", y = "Frekuensi", fill = "Kelas") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")# Fitur kontinu yang akan distandarisasi
fitur_std <- setdiff(names(data_latih_s),
c("Room_Occupancy_Count", "S6_PIR", "S7_PIR"))
# Hitung mean & sd DARI DATA LATIH saja
mu_lat <- colMeans(data_latih_s[, fitur_std])
sd_lat <- apply(data_latih_s[, fitur_std], 2, sd)
sd_lat[sd_lat == 0] <- 1 # hindari bagi nol
# Terapkan ke data latih
data_latih_std <- data_latih_s
data_latih_std[, fitur_std] <- scale(data_latih_s[, fitur_std],
center = mu_lat, scale = sd_lat)
# Terapkan ke data uji (pakai mu & sd dari latih, bukan dari uji!)
X_uji_std <- X_uji
X_uji_std[, fitur_std] <- scale(X_uji[, fitur_std],
center = mu_lat, scale = sd_lat)
cat("Standardisasi selesai.\n")## Standardisasi selesai.
## Contoh mean fitur latih (5 pertama):
## S1_Temp S2_Temp S3_Temp S4_Temp S1_Light
## 25.593 25.683 25.174 25.916 70.673
# Fungsi terpusat untuk menghitung & menampilkan semua metrik
# Selalu memastikan level prediksi = level aktual = LEVELS
evaluasi <- function(y_aktual, y_pred, nama_model) {
# Paksa kedua vektor menjadi faktor dengan level yang sama persis
y_aktual <- factor(as.character(y_aktual), levels = LEVELS)
y_pred <- factor(as.character(y_pred), levels = LEVELS)
cm <- confusionMatrix(y_pred, y_aktual)
acc <- as.numeric(cm$overall["Accuracy"])
pre <- mean(cm$byClass[, "Precision"], na.rm = TRUE)
rec <- mean(cm$byClass[, "Recall"], na.rm = TRUE)
f1 <- mean(cm$byClass[, "F1"], na.rm = TRUE)
cat(sprintf("\n========== %s ==========\n", nama_model))
cat(sprintf(" Akurasi : %.4f (%.2f%%)\n", acc, acc * 100))
cat(sprintf(" Presisi : %.4f\n", pre))
cat(sprintf(" Recall : %.4f\n", rec))
cat(sprintf(" F1-Score : %.4f\n", f1))
cat("\nConfusion Matrix:\n")
print(cm$table)
# Kembalikan list metrik untuk tabel perbandingan
list(cm = cm, akurasi = acc, presisi = pre, recall = rec, f1 = f1)
}5-Fold Stratified CV digunakan untuk memilih hyperparameter terbaik. Stratified memastikan setiap fold memiliki proporsi kelas yang representatif.
set.seed(SEED)
ctrl_cv <- trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = multiClassSummary,
savePredictions = "final",
verboseIter = FALSE
)
cat("Cross-validation: 5-Fold Stratified\n")## Cross-validation: 5-Fold Stratified
## Metric tuning : Akurasi
cp (complexity parameter): mengontrol ukuran pohon.
Nilai kecil → pohon lebih dalam (risiko overfit),
nilai besar → pohon lebih sederhana (risiko underfit).
set.seed(SEED)
grid_dt <- expand.grid(cp = c(0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05))
tuning_dt <- train(
Room_Occupancy_Count ~ .,
data = data_latih_s, # data setelah SMOTE, level "kelas_0"–"kelas_3"
method = "rpart",
trControl = ctrl_cv,
tuneGrid = grid_dt,
metric = "Accuracy"
)
cat("Hasil Tuning Decision Tree:\n")## Hasil Tuning Decision Tree:
## cp Accuracy Kappa
## 1 1e-04 0.9967671 0.9942917
## 2 5e-04 0.9964440 0.9937237
## 3 1e-03 0.9961207 0.9931547
## 4 5e-03 0.9905599 0.9833435
## 5 1e-02 0.9905599 0.9833435
## 6 5e-02 0.9542222 0.9180613
##
## CP terbaik: 1e-04
set.seed(SEED)
cp_terbaik <- tuning_dt$bestTune$cp
model_dt <- rpart(
Room_Occupancy_Count ~ .,
data = data_latih_s,
method = "class",
control = rpart.control(cp = cp_terbaik, maxdepth = 15, minsplit = 10)
)
cat("Decision Tree dilatih dengan cp =", cp_terbaik, "\n")## Decision Tree dilatih dengan cp = 1e-04
##
## Classification tree:
## rpart(formula = Room_Occupancy_Count ~ ., data = data_latih_s,
## method = "class", control = rpart.control(cp = cp_terbaik,
## maxdepth = 15, minsplit = 10))
##
## Variables actually used in tree construction:
## [1] S1_Light S1_Sound S1_Temp S2_Light S2_Sound
## [6] S3_Light S3_Temp S4_Sound S4_Temp S5_CO2
## [11] S5_CO2_Slope S7_PIR
##
## Root node error: 7738/15466 = 0.50032
##
## n= 15466
##
## CP nsplit rel error xerror xstd
## 1 0.85073662 0 1.0000000 1.0000000 0.00803582
## 2 0.05880072 1 0.1492634 0.1493926 0.00422650
## 3 0.03075730 2 0.0904627 0.0905919 0.00334317
## 4 0.01705867 3 0.0597054 0.0598346 0.00273881
## 5 0.01576635 4 0.0426467 0.0451021 0.00238686
## 6 0.01227707 5 0.0268803 0.0277850 0.00188170
## 7 0.00297234 6 0.0146033 0.0169294 0.00147285
## 8 0.00206772 7 0.0116309 0.0112432 0.00120200
## 9 0.00168002 8 0.0095632 0.0102094 0.00114570
## 10 0.00129232 9 0.0078832 0.0087878 0.00106333
## 11 0.00064616 10 0.0065909 0.0078832 0.00100734
## 12 0.00047385 12 0.0052985 0.0063324 0.00090319
## 13 0.00025846 17 0.0028431 0.0060739 0.00088463
## 14 0.00012923 21 0.0018093 0.0062032 0.00089396
## 15 0.00010000 23 0.0015508 0.0058155 0.00086565
rpart.plot(
model_dt,
type = 4,
extra = 104,
fallen.leaves = TRUE,
main = "Decision Tree - Estimasi Penghuni Ruangan",
box.palette = list("#4472C4", "#ED7D31", "#70AD47", "#FF0000"),
shadow.col = "gray",
cex = 0.7
)pred_dt <- predict(model_dt, X_uji, type = "class")
# pred_dt sudah faktor dengan level LEVELS karena data_latih_s punya level yg sama
hasil_dt <- evaluasi(y_uji, pred_dt, "Decision Tree")##
## ========== Decision Tree ==========
## Akurasi : 0.9960 (99.60%)
## Presisi : 0.9858
## Recall : 0.9830
## F1-Score : 0.9844
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 1
## kelas_1 0 88 2 0
## kelas_2 0 2 147 2
## kelas_3 0 1 0 135
decay (regularisasi L2): mencegah overfitting dengan memberikan penalti pada koefisien yang besar.
set.seed(SEED)
grid_rl <- expand.grid(decay = c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5))
tuning_rl <- train(
Room_Occupancy_Count ~ .,
data = data_latih_std, # data terstandarisasi, level "kelas_0"–"kelas_3"
method = "multinom",
trControl = ctrl_cv,
tuneGrid = grid_rl,
maxit = 200,
MaxNWts = 5000,
trace = FALSE,
metric = "Accuracy"
)
cat("Hasil Tuning Regresi Logistik:\n")## Hasil Tuning Regresi Logistik:
## decay Accuracy Kappa
## 1 1e-04 0.9969609 0.9946320
## 2 1e-03 0.9970901 0.9948604
## 3 1e-02 0.9968963 0.9945196
## 4 5e-02 0.9964436 0.9937216
## 5 1e-01 0.9960556 0.9930382
## 6 5e-01 0.9947625 0.9907555
##
## Decay terbaik: 0.001
set.seed(SEED)
decay_terbaik <- tuning_rl$bestTune$decay
suppressMessages(
model_rl <- multinom(
Room_Occupancy_Count ~ .,
data = data_latih_std,
MaxNWts = 5000,
maxit = 200,
trace = FALSE,
decay = decay_terbaik
)
)
cat("Regresi Logistik dilatih dengan decay =", decay_terbaik, "\n")## Regresi Logistik dilatih dengan decay = 0.001
pred_rl <- predict(model_rl, X_uji_std, type = "class")
# predict() multinom mengembalikan faktor/character dengan level "kelas_0"–"kelas_3"
# fungsi evaluasi akan memaksa ke LEVELS yang benar
hasil_rl <- evaluasi(y_uji, pred_rl, "Regresi Logistik")##
## ========== Regresi Logistik ==========
## Akurasi : 0.9951 (99.51%)
## Presisi : 0.9897
## Recall : 0.9800
## F1-Score : 0.9845
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 4
## kelas_1 0 89 0 0
## kelas_2 0 2 149 4
## kelas_3 0 0 0 130
mtry: jumlah fitur yang dipertimbangkan di setiap pemisahan node.
Nilai kecil → pohon lebih beragam,
nilai besar → tiap pohon lebih kuat tapi lebih berkorelasi satu sama lain.
set.seed(SEED)
grid_rf <- expand.grid(mtry = c(2, 4, 6, 8, 10, 12))
tuning_rf <- train(
Room_Occupancy_Count ~ .,
data = data_latih_s, # data setelah SMOTE, level "kelas_0"–"kelas_3"
method = "rf",
trControl = ctrl_cv,
tuneGrid = grid_rf,
ntree = 100,
metric = "Accuracy"
)
cat("Hasil Tuning Random Forest:\n")## Hasil Tuning Random Forest:
## mtry Accuracy Kappa
## 1 2 0.9989009 0.9980593
## 2 4 0.9985776 0.9974880
## 3 6 0.9987068 0.9977165
## 4 8 0.9981251 0.9966894
## 5 10 0.9983190 0.9970318
## 6 12 0.9981250 0.9966890
##
## mtry terbaik: 2
set.seed(SEED)
mtry_terbaik <- tuning_rf$bestTune$mtry
model_rf <- randomForest(
Room_Occupancy_Count ~ .,
data = data_latih_s,
ntree = 300,
mtry = mtry_terbaik,
importance = TRUE
)
print(model_rf)##
## Call:
## randomForest(formula = Room_Occupancy_Count ~ ., data = data_latih_s, ntree = 300, mtry = mtry_terbaik, importance = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.14%
## Confusion matrix:
## kelas_0 kelas_1 kelas_2 kelas_3 class.error
## kelas_0 6581 1 0 1 0.0003038129
## kelas_1 0 7726 2 0 0.0002587992
## kelas_2 0 1 593 5 0.0100166945
## kelas_3 1 0 10 545 0.0197841727
imp_mat <- importance(model_rf)
df_imp <- data.frame(
Fitur = rownames(imp_mat),
Pentingnya = imp_mat[, "MeanDecreaseAccuracy"]
) %>% arrange(desc(Pentingnya))
ggplot(df_imp, aes(x = reorder(Fitur, Pentingnya), y = Pentingnya,
fill = Pentingnya)) +
geom_col(color = "black", width = 0.7) +
coord_flip() +
scale_fill_gradient(low = "#A9D18E", high = "#1F4E79") +
labs(title = "Pentingnya Fitur — Random Forest",
subtitle = "Berdasarkan Mean Decrease Accuracy",
x = "Variabel Sensor", y = "Mean Decrease Accuracy") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"), legend.position = "none")pred_rf <- predict(model_rf, X_uji, type = "class")
# predict() randomForest mengembalikan faktor dengan level dari data latih ("kelas_0"–"kelas_3")
hasil_rf <- evaluasi(y_uji, pred_rf, "Random Forest")##
## ========== Random Forest ==========
## Akurasi : 0.9975 (99.75%)
## Presisi : 0.9911
## Recall : 0.9903
## F1-Score : 0.9906
##
## Confusion Matrix:
## Reference
## Prediction kelas_0 kelas_1 kelas_2 kelas_3
## kelas_0 1645 0 0 1
## kelas_1 0 90 2 0
## kelas_2 0 1 147 1
## kelas_3 0 0 0 136
tabel_hasil <- data.frame(
Model = c("Decision Tree", "Regresi Logistik", "Random Forest"),
Hyperparameter = c(
paste0("cp = ", cp_terbaik),
paste0("decay = ", decay_terbaik),
paste0("mtry = ", mtry_terbaik)
),
Akurasi = round(c(hasil_dt$akurasi, hasil_rl$akurasi, hasil_rf$akurasi), 4),
Presisi = round(c(hasil_dt$presisi, hasil_rl$presisi, hasil_rf$presisi), 4),
Recall = round(c(hasil_dt$recall, hasil_rl$recall, hasil_rf$recall), 4),
F1_Score = round(c(hasil_dt$f1, hasil_rl$f1, hasil_rf$f1), 4)
)
print(tabel_hasil)## Model Hyperparameter Akurasi Presisi Recall F1_Score
## 1 Decision Tree cp = 1e-04 0.9960 0.9858 0.9830 0.9844
## 2 Regresi Logistik decay = 0.001 0.9951 0.9897 0.9800 0.9845
## 3 Random Forest mtry = 2 0.9975 0.9911 0.9903 0.9906
df_plot <- melt(tabel_hasil[, c("Model","Akurasi","Presisi","Recall","F1_Score")],
id.vars = "Model", variable.name = "Metrik", value.name = "Nilai")
ggplot(df_plot, aes(x = Model, y = Nilai, fill = Model)) +
geom_col(color = "black", width = 0.6) +
geom_text(aes(label = sprintf("%.3f", Nilai)),
vjust = -0.4, size = 3.8, fontface = "bold") +
facet_wrap(~Metrik, ncol = 4) +
scale_fill_manual(values = c(
"Decision Tree" = "#4472C4",
"Regresi Logistik" = "#ED7D31",
"Random Forest" = "#70AD47"
)) +
scale_y_continuous(limits = c(0, 1.15),
labels = percent_format(accuracy = 1)) +
labs(title = "Perbandingan Performa Ketiga Model",
subtitle = paste0("SMOTE + Hyperparameter Tuning (5-Fold CV) | Seed = ", SEED),
x = NULL, y = "Nilai Metrik", fill = "Model") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold"),
legend.position = "bottom")terbaik_idx <- which.max(tabel_hasil$F1_Score)
nama_terbaik <- tabel_hasil$Model[terbaik_idx]
cm_terbaik <- switch(nama_terbaik,
"Decision Tree" = hasil_dt$cm,
"Regresi Logistik" = hasil_rl$cm,
"Random Forest" = hasil_rf$cm
)
df_cm <- as.data.frame(cm_terbaik$table)
names(df_cm) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(df_cm, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = Frekuensi), color = "black",
size = 6, fontface = "bold") +
scale_fill_gradient(low = "#EBF5FB", high = "#1F4E79") +
labs(title = paste0("Confusion Matrix — ", nama_terbaik, " (Model Terbaik)"),
subtitle = "Baris = Prediksi | Kolom = Nilai Aktual",
x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Jumlah") +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(face = "bold"))terbaik <- tabel_hasil %>% arrange(desc(F1_Score)) %>% slice(1)
cat("============================================================\n")## ============================================================
## SEED : 42
## Penanganan Imbalance : SMOTE (K=5)
## Evaluasi Tuning : 5-Fold Stratified CV
## ------------------------------------------------------------
## MODEL TERBAIK : Random Forest
## Hyperparameter Terbaik : mtry = 2
## Akurasi : 0.9975 (99.75%)
## F1-Score : 0.9906
## ============================================================