library(ggplot2)
library(dplyr)
library(tidyr)
library(reshape2)
library(gridExtra)
library(scales)
library(GGally) # Pairplot
library(ggcorrplot) # Heatmap korelasi yang lebih rapi
# install.packages(c("ggplot2","dplyr","tidyr","reshape2",
# "gridExtra","scales","GGally","ggcorrplot"))

# Read the occupancy data set. The assignment must sit on its own line:
# when it trails the commented-out install.packages() call it becomes part
# of that comment and `df` is never created.
df <- read.csv("Occupancy_Estimation.csv",
               stringsAsFactors = FALSE)

# Report the dimensions of the loaded data.
cat("Dimensi :", nrow(df), "baris x", ncol(df), "kolom\n")
## Dimensi : 10129 baris x 19 kolom
## Kolom : Date, Time, S1_Temp, S2_Temp, S3_Temp, S4_Temp, S1_Light, S2_Light, S3_Light, S4_Light, S1_Sound, S2_Sound, S3_Sound, S4_Sound, S5_CO2, S5_CO2_Slope, S6_PIR, S7_PIR, Room_Occupancy_Count
## Date Time S1_Temp S2_Temp S3_Temp S4_Temp S1_Light S2_Light
## 1 2017/12/22 10:49:41 24.94 24.75 24.56 25.38 121 34
## 2 2017/12/22 10:50:12 24.94 24.75 24.56 25.44 121 33
## 3 2017/12/22 10:50:42 25.00 24.75 24.50 25.44 121 34
## 4 2017/12/22 10:51:13 25.00 24.75 24.56 25.44 121 34
## 5 2017/12/22 10:51:44 25.00 24.75 24.56 25.44 121 34
## 6 2017/12/22 10:52:14 25.00 24.81 24.56 25.44 121 34
## 7 2017/12/22 10:52:45 25.00 24.75 24.56 25.44 120 34
## 8 2017/12/22 10:53:15 25.00 24.81 24.56 25.44 121 34
## 9 2017/12/22 10:53:46 25.00 24.81 24.56 25.50 122 35
## 10 2017/12/22 10:54:17 25.00 24.81 24.56 25.50 101 34
## S3_Light S4_Light S1_Sound S2_Sound S3_Sound S4_Sound S5_CO2 S5_CO2_Slope
## 1 53 40 0.08 0.19 0.06 0.06 390 0.76923077
## 2 53 40 0.93 0.05 0.06 0.06 390 0.64615385
## 3 53 40 0.43 0.11 0.08 0.06 390 0.51923077
## 4 53 40 0.41 0.10 0.10 0.09 390 0.38846154
## 5 54 40 0.18 0.06 0.06 0.06 390 0.25384615
## 6 54 40 0.13 0.06 0.06 0.07 390 0.16538462
## 7 54 40 1.39 0.32 0.43 0.06 390 0.07692308
## 8 54 41 0.09 0.06 0.09 0.05 390 -0.01153846
## 9 56 43 0.09 0.05 0.06 0.13 390 -0.10000000
## 10 57 43 3.84 0.64 0.48 0.39 390 -0.18846154
## S6_PIR S7_PIR Room_Occupancy_Count
## 1 0 0 1
## 2 0 0 1
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 0 1
## 7 1 0 1
## 8 0 0 1
## 9 0 0 1
## 10 1 1 1
## 'data.frame': 10129 obs. of 19 variables:
## $ Date : chr "2017/12/22" "2017/12/22" "2017/12/22" "2017/12/22" ...
## $ Time : chr "10:49:41" "10:50:12" "10:50:42" "10:51:13" ...
## $ S1_Temp : num 24.9 24.9 25 25 25 ...
## $ S2_Temp : num 24.8 24.8 24.8 24.8 24.8 ...
## $ S3_Temp : num 24.6 24.6 24.5 24.6 24.6 ...
## $ S4_Temp : num 25.4 25.4 25.4 25.4 25.4 ...
## $ S1_Light : int 121 121 121 121 121 121 120 121 122 101 ...
## $ S2_Light : int 34 33 34 34 34 34 34 34 35 34 ...
## $ S3_Light : int 53 53 53 53 54 54 54 54 56 57 ...
## $ S4_Light : int 40 40 40 40 40 40 40 41 43 43 ...
## $ S1_Sound : num 0.08 0.93 0.43 0.41 0.18 0.13 1.39 0.09 0.09 3.84 ...
## $ S2_Sound : num 0.19 0.05 0.11 0.1 0.06 0.06 0.32 0.06 0.05 0.64 ...
## $ S3_Sound : num 0.06 0.06 0.08 0.1 0.06 0.06 0.43 0.09 0.06 0.48 ...
## $ S4_Sound : num 0.06 0.06 0.06 0.09 0.06 0.07 0.06 0.05 0.13 0.39 ...
## $ S5_CO2 : int 390 390 390 390 390 390 390 390 390 390 ...
## $ S5_CO2_Slope : num 0.769 0.646 0.519 0.388 0.254 ...
## $ S6_PIR : int 0 0 0 0 0 0 1 0 0 1 ...
## $ S7_PIR : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Room_Occupancy_Count: int 1 1 1 1 1 1 1 1 1 1 ...
## Date Time S1_Temp S2_Temp
## Length:10129 Length:10129 Min. :24.94 Min. :24.75
## Class :character Class :character 1st Qu.:25.19 1st Qu.:25.19
## Mode :character Mode :character Median :25.38 Median :25.38
## Mean :25.45 Mean :25.55
## 3rd Qu.:25.63 3rd Qu.:25.63
## Max. :26.38 Max. :29.00
## S3_Temp S4_Temp S1_Light S2_Light
## Min. :24.44 Min. :24.94 Min. : 0.00 Min. : 0.00
## 1st Qu.:24.69 1st Qu.:25.44 1st Qu.: 0.00 1st Qu.: 0.00
## Median :24.94 Median :25.75 Median : 0.00 Median : 0.00
## Mean :25.06 Mean :25.75 Mean : 25.45 Mean : 26.02
## 3rd Qu.:25.38 3rd Qu.:26.00 3rd Qu.: 12.00 3rd Qu.: 14.00
## Max. :26.19 Max. :26.56 Max. :165.00 Max. :258.00
## S3_Light S4_Light S1_Sound S2_Sound
## Min. : 0.00 Min. : 0.00 Min. :0.0600 Min. :0.0400
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:0.0700 1st Qu.:0.0500
## Median : 0.00 Median : 0.00 Median :0.0800 Median :0.0500
## Mean : 34.25 Mean :13.22 Mean :0.1682 Mean :0.1201
## 3rd Qu.: 50.00 3rd Qu.:22.00 3rd Qu.:0.0800 3rd Qu.:0.0600
## Max. :280.00 Max. :74.00 Max. :3.8800 Max. :3.4400
## S3_Sound S4_Sound S5_CO2 S5_CO2_Slope
## Min. :0.0400 Min. :0.0500 Min. : 345.0 Min. :-6.29615
## 1st Qu.:0.0600 1st Qu.:0.0600 1st Qu.: 355.0 1st Qu.:-0.04615
## Median :0.0600 Median :0.0800 Median : 360.0 Median : 0.00000
## Mean :0.1581 Mean :0.1038 Mean : 460.9 Mean :-0.00483
## 3rd Qu.:0.0700 3rd Qu.:0.1000 3rd Qu.: 465.0 3rd Qu.: 0.00000
## Max. :3.6700 Max. :3.4000 Max. :1270.0 Max. : 8.98077
## S6_PIR S7_PIR Room_Occupancy_Count
## Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.09014 Mean :0.07957 Mean :0.3986
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :3.0000
# Numeric columns only (everything except Date and Time).
num_cols <- df %>% select(-Date, -Time)

# Descriptive statistics per column, rounded to 3 decimals.
# vapply() with numeric(1) guarantees exactly one number per column, so the
# result type cannot silently change the way it can with sapply().
tabel_desk <- data.frame(
  Variabel = names(num_cols),
  Min      = round(vapply(num_cols, min, numeric(1), na.rm = TRUE), 3),
  Q1       = round(vapply(num_cols, quantile, numeric(1),
                          probs = 0.25, na.rm = TRUE), 3),
  Median   = round(vapply(num_cols, median, numeric(1), na.rm = TRUE), 3),
  Mean     = round(vapply(num_cols, mean, numeric(1), na.rm = TRUE), 3),
  Q3       = round(vapply(num_cols, quantile, numeric(1),
                          probs = 0.75, na.rm = TRUE), 3),
  Max      = round(vapply(num_cols, max, numeric(1), na.rm = TRUE), 3),
  SD       = round(vapply(num_cols, sd, numeric(1), na.rm = TRUE), 3)
)
# Drop the row names inherited from the named statistic vectors; the
# column names are already stored in `Variabel`.
rownames(tabel_desk) <- NULL
print(tabel_desk)
## Variabel Min Q1 Median Mean Q3 Max SD
## 1 S1_Temp 24.940 25.190 25.38 25.454 25.63 26.380 0.351
## 2 S2_Temp 24.750 25.190 25.38 25.546 25.63 29.000 0.586
## 3 S3_Temp 24.440 24.690 24.94 25.057 25.38 26.190 0.427
## 4 S4_Temp 24.940 25.440 25.75 25.754 26.00 26.560 0.356
## 5 S1_Light 0.000 0.000 0.00 25.445 12.00 165.000 51.011
## 6 S2_Light 0.000 0.000 0.00 26.016 14.00 258.000 67.304
## 7 S3_Light 0.000 0.000 0.00 34.248 50.00 280.000 58.401
## 8 S4_Light 0.000 0.000 0.00 13.220 22.00 74.000 19.602
## 9 S1_Sound 0.060 0.070 0.08 0.168 0.08 3.880 0.317
## 10 S2_Sound 0.040 0.050 0.05 0.120 0.06 3.440 0.267
## 11 S3_Sound 0.040 0.060 0.06 0.158 0.07 3.670 0.414
## 12 S4_Sound 0.050 0.060 0.08 0.104 0.10 3.400 0.121
## 13 S5_CO2 345.000 355.000 360.00 460.860 465.00 1270.000 199.965
## 14 S5_CO2_Slope -6.296 -0.046 0.00 -0.005 0.00 8.981 1.165
## 15 S6_PIR 0.000 0.000 0.00 0.090 0.00 1.000 0.286
## 16 S7_PIR 0.000 0.000 0.00 0.080 0.00 1.000 0.271
## 17 Room_Occupancy_Count 0.000 0.000 0.00 0.399 0.00 3.000 0.894
# Count missing values per column, then add the share of all rows
# as a percentage string.
mv <- colSums(is.na(df))
df_mv <- mutate(
  data.frame(Kolom = names(mv), Missing = as.integer(mv)),
  Persen = paste0(round(Missing / nrow(df) * 100, 2), "%")
)
print(df_mv)
## Kolom Missing Persen
## 1 Date 0 0%
## 2 Time 0 0%
## 3 S1_Temp 0 0%
## 4 S2_Temp 0 0%
## 5 S3_Temp 0 0%
## 6 S4_Temp 0 0%
## 7 S1_Light 0 0%
## 8 S2_Light 0 0%
## 9 S3_Light 0 0%
## 10 S4_Light 0 0%
## 11 S1_Sound 0 0%
## 12 S2_Sound 0 0%
## 13 S3_Sound 0 0%
## 14 S4_Sound 0 0%
## 15 S5_CO2 0 0%
## 16 S5_CO2_Slope 0 0%
## 17 S6_PIR 0 0%
## 18 S7_PIR 0 0%
## 19 Room_Occupancy_Count 0 0%
##
## ✔ Tidak ada nilai hilang dalam dataset.
## Jumlah baris duplikat: 0
## ✔ Tidak ada duplikat.
# Overview of every column: its class and its first value as text.
# vapply() with character(1) keeps both columns plain character vectors.
# Class names are collapsed with "/" so a column carrying more than one
# class still yields a single string instead of turning the result into
# a list.
df_tipe <- data.frame(
  Kolom       = names(df),
  TipeData    = vapply(df, function(x) paste(class(x), collapse = "/"),
                       character(1)),
  ContohNilai = vapply(df, function(x) as.character(x[1]), character(1))
)
rownames(df_tipe) <- NULL
print(df_tipe)
## Kolom TipeData ContohNilai
## 1 Date character 2017/12/22
## 2 Time character 10:49:41
## 3 S1_Temp numeric 24.94
## 4 S2_Temp numeric 24.75
## 5 S3_Temp numeric 24.56
## 6 S4_Temp numeric 25.38
## 7 S1_Light integer 121
## 8 S2_Light integer 34
## 9 S3_Light integer 53
## 10 S4_Light integer 40
## 11 S1_Sound numeric 0.08
## 12 S2_Sound numeric 0.19
## 13 S3_Sound numeric 0.06
## 14 S4_Sound numeric 0.06
## 15 S5_CO2 integer 390
## 16 S5_CO2_Slope numeric 0.769230769231
## 17 S6_PIR integer 0
## 18 S7_PIR integer 0
## 19 Room_Occupancy_Count integer 1
# Frequency table of the target variable, with each class's share of all
# rows formatted as a percentage string.
tbl <- table(df$Room_Occupancy_Count)
persen_kelas <- round(prop.table(tbl) * 100, 2)
df_dist <- data.frame(
  Kelas      = names(tbl),
  Frekuensi  = as.integer(tbl),
  Persentase = paste0(persen_kelas, "%")
)
print(df_dist)
## Kelas Frekuensi Persentase
## 1 0 8228 81.23%
## 2 1 459 4.53%
## 3 2 748 7.38%
## 4 3 694 6.85%
# Bar chart of the target classes with the row count printed above each bar.
ggplot(df, aes(x = factor(Room_Occupancy_Count),
               fill = factor(Room_Occupancy_Count))) +
  geom_bar(color = "black", width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.5, size = 4.5, fontface = "bold") +
  scale_fill_manual(
    values = c("0"="#4472C4","1"="#ED7D31","2"="#70AD47","3"="#E74C3C"),
    labels = c("0 Penghuni","1 Penghuni","2 Penghuni","3 Penghuni")
  ) +
  labs(title = "Distribusi Kelas Penghuni Ruangan",
       subtitle = "Kelas 0 mendominasi sebesar 81.23% — terdapat ketidakseimbangan kelas",
       x = "Jumlah Penghuni", y = "Frekuensi", fill = "Kelas") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"),
        plot.subtitle = element_text(color = "gray40"))

# Long format: one row per (observation, feature) pair, keeping the target
# as a factor. This must be a separate statement from the plot above; fused
# onto the same line it is a parse error.
df_long <- df %>%
  select(-Date, -Time) %>%
  mutate(Room_Occupancy_Count = factor(Room_Occupancy_Count)) %>%
  pivot_longer(-Room_Occupancy_Count,
               names_to = "Variabel", values_to = "Nilai")
# One histogram per feature; every panel gets its own x and y scale.
ggplot(df_long, aes(x = Nilai, fill = Variabel)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85) +
  facet_wrap(~Variabel, scales = "free", ncol = 4) +
  scale_fill_viridis_d(guide = "none") +
  labs(title = "Distribusi Setiap Fitur Sensor",
       x = "Nilai", y = "Frekuensi") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(face = "bold"),
        strip.text = element_text(face = "bold", size = 9))

# Long-format copy of the data used by the per-class boxplots. It is a
# separate statement from the plot above; fused onto the same line it is a
# parse error.
df_long2 <- df %>%
  select(-Date, -Time) %>%
  mutate(Room_Occupancy_Count = factor(Room_Occupancy_Count)) %>%
  pivot_longer(-Room_Occupancy_Count,
               names_to = "Variabel", values_to = "Nilai")
# Boxplot of every feature split by occupancy class; the y axis is free per
# panel because the features are on very different scales.
ggplot(df_long2, aes(x = Room_Occupancy_Count, y = Nilai,
                     fill = Room_Occupancy_Count)) +
  geom_boxplot(alpha = 0.75, outlier.size = 0.5, outlier.alpha = 0.3) +
  facet_wrap(~Variabel, scales = "free_y", ncol = 4) +
  scale_fill_manual(
    values = c("0"="#4472C4","1"="#ED7D31","2"="#70AD47","3"="#E74C3C")
  ) +
  labs(title = "Distribusi Fitur Sensor per Kelas Penghuni",
       x = "Jumlah Penghuni", y = "Nilai", fill = "Kelas") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(face = "bold"),
        strip.text = element_text(face = "bold", size = 9),
        legend.position = "bottom")

# Temperature, light, sound and CO2 features used for the correlation
# matrix (the PIR columns and the target are left out). This is a separate
# statement from the plot above; fused onto the same line it is a parse
# error.
fitur_num <- df %>%
  select(S1_Temp, S2_Temp, S3_Temp, S4_Temp,
         S1_Light, S2_Light, S3_Light, S4_Light,
         S1_Sound, S2_Sound, S3_Sound, S4_Sound,
         S5_CO2, S5_CO2_Slope)

# Pairwise correlation matrix (cor() defaults to Pearson).
mat_kor <- cor(fitur_num)
# Lower-triangle heatmap of the feature correlation matrix with the
# coefficients printed in each cell.
ggcorrplot(mat_kor,
           method = "square",
           type = "lower",
           lab = TRUE,
           lab_size = 3,
           colors = c("#D7191C", "white", "#2C7BB6"),
           title = "Matriks Korelasi Fitur Sensor",
           ggtheme = theme_minimal(base_size = 11))

# Correlation of every feature with the target, in long format and sorted
# by absolute strength (strongest first). This is a separate statement from
# the plot above; fused onto the same line it is a parse error.
df_kor_target <- df %>%
  select(-Date, -Time) %>%
  summarise(across(-Room_Occupancy_Count,
                   ~cor(., Room_Occupancy_Count))) %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Korelasi") %>%
  arrange(desc(abs(Korelasi)))
# Horizontal bar chart of the feature-target correlations, ordered by
# absolute value and colored by sign.
ggplot(df_kor_target, aes(x = reorder(Fitur, abs(Korelasi)),
                          y = Korelasi, fill = Korelasi > 0)) +
  geom_col(color = "black", width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("TRUE" = "#2C7BB6", "FALSE" = "#D7191C"),
                    labels = c("TRUE" = "Positif", "FALSE" = "Negatif")) +
  labs(title = "Korelasi Setiap Fitur dengan Room_Occupancy_Count",
       x = "Fitur", y = "Koefisien Korelasi Pearson", fill = "Arah") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

# Derive time features from the character Date ("YYYY/MM/DD") and Time
# ("HH:MM:SS") columns. The date format is spelled out explicitly so it
# matches the format used for DateTime instead of relying on as.Date()'s
# format guessing. This and the next statement are separated from their
# neighbors; fused onto shared lines they are parse errors.
df_waktu <- df %>%
  mutate(
    DateTime = as.POSIXct(paste(Date, Time), format = "%Y/%m/%d %H:%M:%S"),
    Jam      = as.integer(format(DateTime, "%H")),
    Tanggal  = as.Date(Date, format = "%Y/%m/%d")
  )

# Mean occupancy count for each hour of the day.
df_jam <- df_waktu %>%
  group_by(Jam) %>%
  summarise(RataRata = mean(Room_Occupancy_Count),
            .groups = "drop")
# Line chart of mean occupancy by hour of day, with a tick for every hour.
ggplot(df_jam, aes(x = Jam, y = RataRata)) +
  geom_line(color = "#2C7BB6", linewidth = 1.2) +
  geom_point(color = "#1F4E79", size = 2.5) +
  labs(title = "Rata-rata Penghuni per Jam dalam Sehari",
       subtitle = "Puncak aktivitas terlihat pada jam kerja",
       x = "Jam", y = "Rata-rata Jumlah Penghuni") +
  scale_x_continuous(breaks = 0:23) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

# Mean occupancy count per calendar day. This is a separate statement from
# the plot above; fused onto the same line it is a parse error.
df_hari <- df_waktu %>%
  group_by(Tanggal) %>%
  summarise(RataRata = mean(Room_Occupancy_Count),
            .groups = "drop")
# Line chart of mean occupancy per day over the observation period.
ggplot(df_hari, aes(x = Tanggal, y = RataRata)) +
  geom_line(color = "#ED7D31", linewidth = 1) +
  geom_point(color = "#C0392B", size = 2) +
  labs(title = "Rata-rata Penghuni per Hari",
       subtitle = "Selama periode pengamatan (22 Des 2017 – 11 Jan 2018)",
       x = "Tanggal", y = "Rata-rata Jumlah Penghuni") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

# Features shown in the outlier boxplots (PIR columns and target excluded).
# This is a separate statement from the plot above; fused onto the same
# line it is a parse error.
fitur_plot <- c("S1_Temp","S2_Temp","S3_Temp","S4_Temp",
                "S5_CO2","S5_CO2_Slope",
                "S1_Light","S2_Light","S3_Light","S4_Light",
                "S1_Sound","S2_Sound","S3_Sound","S4_Sound")

# Long format restricted to those features: one row per (row, feature).
df_out <- df %>%
  select(all_of(fitur_plot)) %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai")
# One boxplot panel per feature with outliers drawn in red; scales are free
# because the features are on very different ranges.
ggplot(df_out, aes(x = Fitur, y = Nilai, fill = Fitur)) +
  geom_boxplot(outlier.color = "red", outlier.size = 0.8, alpha = 0.7) +
  facet_wrap(~Fitur, scales = "free", ncol = 7) +
  scale_fill_viridis_d(guide = "none") +
  labs(title = "Deteksi Outlier — Boxplot per Sensor",
       x = NULL, y = "Nilai") +
  theme_minimal(base_size = 9) +
  theme(axis.text.x = element_blank(),
        strip.text = element_text(face = "bold"),
        plot.title = element_text(face = "bold"))

# Count the values of `x` lying outside the 1.5 * IQR fences, i.e. below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing values are ignored.
# Returns a single integer count.
# The definition is a separate statement from the plot above; fused onto
# the same line it is a parse error.
hitung_outlier <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  sum(x < (Q1 - 1.5 * IQR_val) | x > (Q3 + 1.5 * IQR_val), na.rm = TRUE)
}
# Step 1: apply hitung_outlier() to every predictor column
# (Date, Time and the target are dropped first).
df_outlier <- df %>%
  select(-c(Date, Time, Room_Occupancy_Count)) %>%
  summarise(across(everything(), hitung_outlier))

# Step 2: reshape to one row per feature, add the share of all rows as a
# percentage string, and sort with the most outlier-heavy feature first.
df_outlier <- df_outlier %>%
  pivot_longer(everything(),
               names_to = "Fitur", values_to = "JumlahOutlier") %>%
  mutate(Persen = paste0(round(JumlahOutlier / nrow(df) * 100, 2), "%")) %>%
  arrange(desc(JumlahOutlier))

print(df_outlier)
## # A tibble: 16 × 3
## Fitur JumlahOutlier Persen
## <chr> <int> <chr>
## 1 S5_CO2_Slope 4033 39.82%
## 2 S1_Sound 1772 17.49%
## 3 S1_Light 1716 16.94%
## 4 S2_Sound 1703 16.81%
## 5 S3_Sound 1671 16.5%
## 6 S5_CO2 1657 16.36%
## 7 S2_Light 1017 10.04%
## 8 S3_Light 950 9.38%
## 9 S2_Temp 948 9.36%
## 10 S6_PIR 913 9.01%
## 11 S7_PIR 806 7.96%
## 12 S4_Sound 721 7.12%
## 13 S4_Light 543 5.36%
## 14 S1_Temp 162 1.6%
## 15 S3_Temp 0 0%
## 16 S4_Temp 0 0%
## ============================================================
## RINGKASAN EKSPLORASI DATA
## ============================================================
## Total observasi : 10129
## Total fitur prediktor : 16
## Missing values : 0
## Baris duplikat : 0
##
## Distribusi kelas:
# Class distribution shown again in the summary: built from the frequency
# table `tbl`, with each class's share formatted as a percentage string.
ringkasan_kelas <- data.frame(
  Kelas     = names(tbl),
  Frekuensi = as.integer(tbl),
  Persen    = paste0(round(prop.table(tbl)*100, 2), "%")
)
print(ringkasan_kelas)
## Kelas Frekuensi Persen
## 1 0 8228 81.23%
## 2 1 459 4.53%
## 3 2 748 7.38%
## 4 3 694 6.85%
##
## Fitur dengan korelasi tertinggi ke target:
## # A tibble: 5 × 2
## Fitur Korelasi
## <chr> <dbl>
## 1 S1_Light 0.849
## 2 S3_Light 0.793
## 3 S2_Light 0.789
## 4 S1_Temp 0.701
## 5 S7_PIR 0.695
##
## Fitur dengan outlier terbanyak:
## # A tibble: 5 × 3
## Fitur JumlahOutlier Persen
## <chr> <int> <chr>
## 1 S5_CO2_Slope 4033 39.82%
## 2 S1_Sound 1772 17.49%
## 3 S1_Light 1716 16.94%
## 4 S2_Sound 1703 16.81%
## 5 S3_Sound 1671 16.5%
## ============================================================