1. Muat Library

library(ggplot2)
library(dplyr)
library(tidyr)
library(reshape2)
library(gridExtra)
library(scales)
library(GGally)      # Pairplot
library(ggcorrplot)  # Heatmap korelasi yang lebih rapi

# install.packages(c("ggplot2","dplyr","tidyr","reshape2",
#                    "gridExtra","scales","GGally","ggcorrplot"))

2. Load Dataset

# Read the raw sensor log from the working directory; text columns are kept
# as plain character vectors rather than being converted to factors.
df <- read.csv(file = "Occupancy_Estimation.csv", stringsAsFactors = FALSE)

# Report the table shape (rows first, then columns).
cat("Dimensi  :", dim(df)[1L], "baris x", dim(df)[2L], "kolom\n")

## Dimensi  : 10129 baris x 19 kolom

cat("Kolom    :", paste(names(df), collapse = ", "), "\n")

## Kolom    : Date, Time, S1_Temp, S2_Temp, S3_Temp, S4_Temp, S1_Light, S2_Light, S3_Light, S4_Light, S1_Sound, S2_Sound, S3_Sound, S4_Sound, S5_CO2, S5_CO2_Slope, S6_PIR, S7_PIR, Room_Occupancy_Count

Tampilan Awal Data

head(df, 10)

##          Date     Time S1_Temp S2_Temp S3_Temp S4_Temp S1_Light S2_Light
## 1  2017/12/22 10:49:41   24.94   24.75   24.56   25.38      121       34
## 2  2017/12/22 10:50:12   24.94   24.75   24.56   25.44      121       33
## 3  2017/12/22 10:50:42   25.00   24.75   24.50   25.44      121       34
## 4  2017/12/22 10:51:13   25.00   24.75   24.56   25.44      121       34
## 5  2017/12/22 10:51:44   25.00   24.75   24.56   25.44      121       34
## 6  2017/12/22 10:52:14   25.00   24.81   24.56   25.44      121       34
## 7  2017/12/22 10:52:45   25.00   24.75   24.56   25.44      120       34
## 8  2017/12/22 10:53:15   25.00   24.81   24.56   25.44      121       34
## 9  2017/12/22 10:53:46   25.00   24.81   24.56   25.50      122       35
## 10 2017/12/22 10:54:17   25.00   24.81   24.56   25.50      101       34
##    S3_Light S4_Light S1_Sound S2_Sound S3_Sound S4_Sound S5_CO2 S5_CO2_Slope
## 1        53       40     0.08     0.19     0.06     0.06    390   0.76923077
## 2        53       40     0.93     0.05     0.06     0.06    390   0.64615385
## 3        53       40     0.43     0.11     0.08     0.06    390   0.51923077
## 4        53       40     0.41     0.10     0.10     0.09    390   0.38846154
## 5        54       40     0.18     0.06     0.06     0.06    390   0.25384615
## 6        54       40     0.13     0.06     0.06     0.07    390   0.16538462
## 7        54       40     1.39     0.32     0.43     0.06    390   0.07692308
## 8        54       41     0.09     0.06     0.09     0.05    390  -0.01153846
## 9        56       43     0.09     0.05     0.06     0.13    390  -0.10000000
## 10       57       43     3.84     0.64     0.48     0.39    390  -0.18846154
##    S6_PIR S7_PIR Room_Occupancy_Count
## 1       0      0                    1
## 2       0      0                    1
## 3       0      0                    1
## 4       0      0                    1
## 5       0      0                    1
## 6       0      0                    1
## 7       1      0                    1
## 8       0      0                    1
## 9       0      0                    1
## 10      1      1                    1

str(df)

## 'data.frame':    10129 obs. of  19 variables:
##  $ Date                : chr  "2017/12/22" "2017/12/22" "2017/12/22" "2017/12/22" ...
##  $ Time                : chr  "10:49:41" "10:50:12" "10:50:42" "10:51:13" ...
##  $ S1_Temp             : num  24.9 24.9 25 25 25 ...
##  $ S2_Temp             : num  24.8 24.8 24.8 24.8 24.8 ...
##  $ S3_Temp             : num  24.6 24.6 24.5 24.6 24.6 ...
##  $ S4_Temp             : num  25.4 25.4 25.4 25.4 25.4 ...
##  $ S1_Light            : int  121 121 121 121 121 121 120 121 122 101 ...
##  $ S2_Light            : int  34 33 34 34 34 34 34 34 35 34 ...
##  $ S3_Light            : int  53 53 53 53 54 54 54 54 56 57 ...
##  $ S4_Light            : int  40 40 40 40 40 40 40 41 43 43 ...
##  $ S1_Sound            : num  0.08 0.93 0.43 0.41 0.18 0.13 1.39 0.09 0.09 3.84 ...
##  $ S2_Sound            : num  0.19 0.05 0.11 0.1 0.06 0.06 0.32 0.06 0.05 0.64 ...
##  $ S3_Sound            : num  0.06 0.06 0.08 0.1 0.06 0.06 0.43 0.09 0.06 0.48 ...
##  $ S4_Sound            : num  0.06 0.06 0.06 0.09 0.06 0.07 0.06 0.05 0.13 0.39 ...
##  $ S5_CO2              : int  390 390 390 390 390 390 390 390 390 390 ...
##  $ S5_CO2_Slope        : num  0.769 0.646 0.519 0.388 0.254 ...
##  $ S6_PIR              : int  0 0 0 0 0 0 1 0 0 1 ...
##  $ S7_PIR              : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Room_Occupancy_Count: int  1 1 1 1 1 1 1 1 1 1 ...

3. Statistik Deskriptif

Ringkasan Umum

summary(df)

##      Date               Time              S1_Temp         S2_Temp     
##  Length:10129       Length:10129       Min.   :24.94   Min.   :24.75  
##  Class :character   Class :character   1st Qu.:25.19   1st Qu.:25.19  
##  Mode  :character   Mode  :character   Median :25.38   Median :25.38  
##                                        Mean   :25.45   Mean   :25.55  
##                                        3rd Qu.:25.63   3rd Qu.:25.63  
##                                        Max.   :26.38   Max.   :29.00  
##     S3_Temp         S4_Temp         S1_Light         S2_Light     
##  Min.   :24.44   Min.   :24.94   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:24.69   1st Qu.:25.44   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :24.94   Median :25.75   Median :  0.00   Median :  0.00  
##  Mean   :25.06   Mean   :25.75   Mean   : 25.45   Mean   : 26.02  
##  3rd Qu.:25.38   3rd Qu.:26.00   3rd Qu.: 12.00   3rd Qu.: 14.00  
##  Max.   :26.19   Max.   :26.56   Max.   :165.00   Max.   :258.00  
##     S3_Light         S4_Light        S1_Sound         S2_Sound     
##  Min.   :  0.00   Min.   : 0.00   Min.   :0.0600   Min.   :0.0400  
##  1st Qu.:  0.00   1st Qu.: 0.00   1st Qu.:0.0700   1st Qu.:0.0500  
##  Median :  0.00   Median : 0.00   Median :0.0800   Median :0.0500  
##  Mean   : 34.25   Mean   :13.22   Mean   :0.1682   Mean   :0.1201  
##  3rd Qu.: 50.00   3rd Qu.:22.00   3rd Qu.:0.0800   3rd Qu.:0.0600  
##  Max.   :280.00   Max.   :74.00   Max.   :3.8800   Max.   :3.4400  
##     S3_Sound         S4_Sound          S5_CO2        S5_CO2_Slope     
##  Min.   :0.0400   Min.   :0.0500   Min.   : 345.0   Min.   :-6.29615  
##  1st Qu.:0.0600   1st Qu.:0.0600   1st Qu.: 355.0   1st Qu.:-0.04615  
##  Median :0.0600   Median :0.0800   Median : 360.0   Median : 0.00000  
##  Mean   :0.1581   Mean   :0.1038   Mean   : 460.9   Mean   :-0.00483  
##  3rd Qu.:0.0700   3rd Qu.:0.1000   3rd Qu.: 465.0   3rd Qu.: 0.00000  
##  Max.   :3.6700   Max.   :3.4000   Max.   :1270.0   Max.   : 8.98077  
##      S6_PIR            S7_PIR        Room_Occupancy_Count
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000      
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000      
##  Median :0.00000   Median :0.00000   Median :0.0000      
##  Mean   :0.09014   Mean   :0.07957   Mean   :0.3986      
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000      
##  Max.   :1.00000   Max.   :1.00000   Max.   :3.0000

Ringkasan Per-Variabel (Tabel Rapi)

# Numeric columns only: everything except the Date and Time text columns.
num_cols <- df %>% select(-Date, -Time)

# One row per variable with its five-number summary, mean and standard
# deviation, each rounded to 3 decimals.
# vapply() pins every statistic to exactly one number per column, so an
# unexpected column raises an error instead of silently changing the shape of
# the result the way sapply() can. Integer results (e.g. min of an integer
# column) are promoted to double by vapply().
tabel_desk <- data.frame(
  Variabel = names(num_cols),
  Min      = round(vapply(num_cols, min, numeric(1), na.rm = TRUE), 3),
  Q1       = round(vapply(num_cols, quantile, numeric(1),
                          probs = 0.25, na.rm = TRUE, names = FALSE), 3),
  Median   = round(vapply(num_cols, median, numeric(1), na.rm = TRUE), 3),
  Mean     = round(vapply(num_cols, mean, numeric(1), na.rm = TRUE), 3),
  Q3       = round(vapply(num_cols, quantile, numeric(1),
                          probs = 0.75, na.rm = TRUE, names = FALSE), 3),
  Max      = round(vapply(num_cols, max, numeric(1), na.rm = TRUE), 3),
  SD       = round(vapply(num_cols, sd, numeric(1), na.rm = TRUE), 3)
)
# The variable name already lives in the Variabel column; drop the duplicate
# row names that the named vectors would otherwise contribute.
rownames(tabel_desk) <- NULL
print(tabel_desk)

##                Variabel     Min      Q1 Median    Mean     Q3      Max      SD
## 1               S1_Temp  24.940  25.190  25.38  25.454  25.63   26.380   0.351
## 2               S2_Temp  24.750  25.190  25.38  25.546  25.63   29.000   0.586
## 3               S3_Temp  24.440  24.690  24.94  25.057  25.38   26.190   0.427
## 4               S4_Temp  24.940  25.440  25.75  25.754  26.00   26.560   0.356
## 5              S1_Light   0.000   0.000   0.00  25.445  12.00  165.000  51.011
## 6              S2_Light   0.000   0.000   0.00  26.016  14.00  258.000  67.304
## 7              S3_Light   0.000   0.000   0.00  34.248  50.00  280.000  58.401
## 8              S4_Light   0.000   0.000   0.00  13.220  22.00   74.000  19.602
## 9              S1_Sound   0.060   0.070   0.08   0.168   0.08    3.880   0.317
## 10             S2_Sound   0.040   0.050   0.05   0.120   0.06    3.440   0.267
## 11             S3_Sound   0.040   0.060   0.06   0.158   0.07    3.670   0.414
## 12             S4_Sound   0.050   0.060   0.08   0.104   0.10    3.400   0.121
## 13               S5_CO2 345.000 355.000 360.00 460.860 465.00 1270.000 199.965
## 14         S5_CO2_Slope  -6.296  -0.046   0.00  -0.005   0.00    8.981   1.165
## 15               S6_PIR   0.000   0.000   0.00   0.090   0.00    1.000   0.286
## 16               S7_PIR   0.000   0.000   0.00   0.080   0.00    1.000   0.271
## 17 Room_Occupancy_Count   0.000   0.000   0.00   0.399   0.00    3.000   0.894

4. Pengecekan Kualitas Data

Missing Values

# Count NA cells per column, then tabulate each count together with its share
# of the total number of rows (formatted as a percentage string).
mv <- colSums(is.na(df))
df_mv <- data.frame(Kolom = names(mv), Missing = as.integer(mv))
df_mv$Persen <- paste0(round(df_mv$Missing / nrow(df) * 100, 2), "%")

print(df_mv)

##                   Kolom Missing Persen
## 1                  Date       0     0%
## 2                  Time       0     0%
## 3               S1_Temp       0     0%
## 4               S2_Temp       0     0%
## 5               S3_Temp       0     0%
## 6               S4_Temp       0     0%
## 7              S1_Light       0     0%
## 8              S2_Light       0     0%
## 9              S3_Light       0     0%
## 10             S4_Light       0     0%
## 11             S1_Sound       0     0%
## 12             S2_Sound       0     0%
## 13             S3_Sound       0     0%
## 14             S4_Sound       0     0%
## 15               S5_CO2       0     0%
## 16         S5_CO2_Slope       0     0%
## 17               S6_PIR       0     0%
## 18               S7_PIR       0     0%
## 19 Room_Occupancy_Count       0     0%

# mv holds non-negative per-column NA counts, so "every count is zero" is the
# same condition as "the counts sum to zero".
if (all(mv == 0)) cat("\n✔ Tidak ada nilai hilang dalam dataset.\n")

## 
## ✔ Tidak ada nilai hilang dalam dataset.

Duplikat

# Number of rows that are exact repeats of an earlier row.
baris_duplikat <- duplicated(df)
n_dup <- length(which(baris_duplikat))
cat("Jumlah baris duplikat:", n_dup, "\n")

## Jumlah baris duplikat: 0

if (n_dup == 0) cat("✔ Tidak ada duplikat.\n")

## ✔ Tidak ada duplikat.

Tipe Data

# Column overview: name, class, and the first value rendered as text.
# class() may return several strings for one column (e.g. a date-time or an
# ordered factor); sapply() would then yield a list and data.frame() would
# fail. Collapsing with "/" keeps TipeData a plain character vector, and
# vapply() guarantees exactly one string per column.
df_tipe <- data.frame(
  Kolom       = names(df),
  TipeData    = vapply(df, function(x) paste(class(x), collapse = "/"),
                       character(1)),
  ContohNilai = vapply(df, function(x) as.character(x[1]), character(1))
)
# Column names are already stored in Kolom; drop the duplicated row names.
rownames(df_tipe) <- NULL
print(df_tipe)

##                   Kolom  TipeData    ContohNilai
## 1                  Date character     2017/12/22
## 2                  Time character       10:49:41
## 3               S1_Temp   numeric          24.94
## 4               S2_Temp   numeric          24.75
## 5               S3_Temp   numeric          24.56
## 6               S4_Temp   numeric          25.38
## 7              S1_Light   integer            121
## 8              S2_Light   integer             34
## 9              S3_Light   integer             53
## 10             S4_Light   integer             40
## 11             S1_Sound   numeric           0.08
## 12             S2_Sound   numeric           0.19
## 13             S3_Sound   numeric           0.06
## 14             S4_Sound   numeric           0.06
## 15               S5_CO2   integer            390
## 16         S5_CO2_Slope   numeric 0.769230769231
## 17               S6_PIR   integer              0
## 18               S7_PIR   integer              0
## 19 Room_Occupancy_Count   integer              1

5. Distribusi Variabel Target

# Frequency of each occupancy class, plus its share of all rows as a
# percentage string rounded to 2 decimals.
tbl <- table(df[["Room_Occupancy_Count"]])
df_dist <- data.frame(Kelas = names(tbl), Frekuensi = as.integer(tbl))
df_dist$Persentase <- paste0(round(prop.table(tbl) * 100, 2), "%")
print(df_dist)

##   Kelas Frekuensi Persentase
## 1     0      8228     81.23%
## 2     1       459      4.53%
## 3     2       748      7.38%
## 4     3       694      6.85%

# Share of the most frequent class, computed from the data so the subtitle
# cannot drift away from the bars drawn below it. The wording about class
# imbalance remains fixed text.
proporsi_kelas <- prop.table(table(df$Room_Occupancy_Count))
kelas_dominan  <- names(proporsi_kelas)[which.max(proporsi_kelas)]
subjudul_kelas <- sprintf(
  "Kelas %s mendominasi sebesar %.2f%% — terdapat ketidakseimbangan kelas",
  kelas_dominan, 100 * max(proporsi_kelas)
)

# Bar chart of the target: one bar per occupancy count, with the row count
# printed above each bar.
ggplot(df, aes(x = factor(Room_Occupancy_Count),
               fill = factor(Room_Occupancy_Count))) +
  geom_bar(color = "black", width = 0.6) +
  # stat = "count" recomputes the bar heights so they can be used as labels.
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.5, size = 4.5, fontface = "bold") +
  scale_fill_manual(
    values = c("0"="#4472C4","1"="#ED7D31","2"="#70AD47","3"="#E74C3C"),
    labels = c("0 Penghuni","1 Penghuni","2 Penghuni","3 Penghuni")
  ) +
  labs(title    = "Distribusi Kelas Penghuni Ruangan",
       subtitle = subjudul_kelas,
       x = "Jumlah Penghuni", y = "Frekuensi", fill = "Kelas") +
  theme_minimal(base_size = 13) +
  theme(plot.title    = element_text(face = "bold"),
        plot.subtitle = element_text(color = "gray40"))

6. Distribusi Fitur Sensor

Histogram Semua Fitur

# Long layout: one row per (observation, variable) pair. The occupancy count
# is kept alongside as a factor; every other non-date column is stacked into
# Variabel / Nilai.
df_long <- pivot_longer(
  mutate(select(df, -Date, -Time),
         Room_Occupancy_Count = factor(Room_Occupancy_Count)),
  cols      = -Room_Occupancy_Count,
  names_to  = "Variabel",
  values_to = "Nilai"
)

# One histogram panel per variable; both axes are free because the sensors
# are measured on very different scales.
ggplot(df_long) +
  geom_histogram(aes(x = Nilai, fill = Variabel),
                 bins = 40, color = "white", alpha = 0.85) +
  facet_wrap(vars(Variabel), scales = "free", ncol = 4) +
  labs(title = "Distribusi Setiap Fitur Sensor",
       x = "Nilai", y = "Frekuensi") +
  # Colour only distinguishes panels, so the legend is suppressed.
  scale_fill_viridis_d(guide = "none") +
  theme_minimal(base_size = 10) +
  theme(strip.text = element_text(face = "bold", size = 9),
        plot.title = element_text(face = "bold"))

Boxplot Fitur per Kelas Penghuni

# Long layout with the occupancy class as a factor, so each variable can be
# compared across classes.
df_long2 <- pivot_longer(
  mutate(select(df, -Date, -Time),
         Room_Occupancy_Count = factor(Room_Occupancy_Count)),
  cols      = -Room_Occupancy_Count,
  names_to  = "Variabel",
  values_to = "Nilai"
)

# One panel per variable, one box per occupancy class; only the y axis is
# free so the class positions line up across panels.
ggplot(df_long2) +
  geom_boxplot(aes(x = Room_Occupancy_Count, y = Nilai,
                   fill = Room_Occupancy_Count),
               alpha = 0.75, outlier.size = 0.5, outlier.alpha = 0.3) +
  facet_wrap(vars(Variabel), scales = "free_y", ncol = 4) +
  labs(title = "Distribusi Fitur Sensor per Kelas Penghuni",
       x = "Jumlah Penghuni", y = "Nilai", fill = "Kelas") +
  scale_fill_manual(
    values = c("0"="#4472C4","1"="#ED7D31","2"="#70AD47","3"="#E74C3C")
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "bottom",
        strip.text = element_text(face = "bold", size = 9),
        plot.title = element_text(face = "bold"))

7. Analisis Korelasi

Matriks Korelasi (Heatmap)

# Temperature, light, sound and CO2 columns only; the PIR columns and the
# target are not part of this matrix.
fitur_num <- df[c("S1_Temp", "S2_Temp", "S3_Temp", "S4_Temp",
                  "S1_Light", "S2_Light", "S3_Light", "S4_Light",
                  "S1_Sound", "S2_Sound", "S3_Sound", "S4_Sound",
                  "S5_CO2", "S5_CO2_Slope")]

# Pairwise Pearson correlations (the default method of cor()).
mat_kor <- cor(fitur_num, method = "pearson")

# Lower-triangle heatmap with the coefficient printed in every cell; the
# three colours are passed in the order low, mid, high.
ggcorrplot(
  mat_kor,
  type     = "lower",
  method   = "square",
  lab      = TRUE,
  lab_size = 3,
  title    = "Matriks Korelasi Fitur Sensor",
  colors   = c("#D7191C", "white", "#2C7BB6"),
  ggtheme  = theme_minimal(base_size = 11)
)

Korelasi Fitur dengan Target

# Pearson correlation of every non-date column with the occupancy count,
# reshaped to one row per feature and sorted by absolute strength.
df_kor_target <- local({
  kandidat <- select(df, -Date, -Time)
  kor_lebar <- summarise(
    kandidat,
    across(-Room_Occupancy_Count, ~cor(.x, Room_Occupancy_Count))
  )
  kor_panjang <- pivot_longer(kor_lebar, cols = everything(),
                              names_to = "Fitur", values_to = "Korelasi")
  arrange(kor_panjang, desc(abs(Korelasi)))
})

# Horizontal bars ordered by absolute correlation; the fill encodes the sign.
ggplot(df_kor_target) +
  geom_col(aes(x = reorder(Fitur, abs(Korelasi)), y = Korelasi,
               fill = Korelasi > 0),
           color = "black", width = 0.7) +
  coord_flip() +
  labs(title = "Korelasi Setiap Fitur dengan Room_Occupancy_Count",
       x = "Fitur", y = "Koefisien Korelasi Pearson", fill = "Arah") +
  scale_fill_manual(values = c("TRUE" = "#2C7BB6", "FALSE" = "#D7191C"),
                    labels = c("TRUE" = "Positif", "FALSE" = "Negatif")) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

8. Analisis Temporal

Tren Penghuni Berdasarkan Waktu

# Add parsed time columns:
#   DateTime - timestamp built from the Date and Time text columns
#   Jam      - hour of day (0-23) as logged
#   Tanggal  - calendar date
# A fixed tz is passed so parsing does not depend on the time zone of the
# machine running the script; without it as.POSIXct() uses the session's
# local zone. UTC is used only as a DST-free container here, and format()
# reads the hour back in that same zone, so Jam is the wall-clock hour that
# appears in the file. The date format is spelled out rather than left to
# as.Date()'s format guessing.
df_waktu <- df %>%
  mutate(
    DateTime = as.POSIXct(paste(Date, Time),
                          format = "%Y/%m/%d %H:%M:%S", tz = "UTC"),
    Jam      = as.integer(format(DateTime, "%H")),
    Tanggal  = as.Date(Date, format = "%Y/%m/%d")
  )

# Mean occupancy count for each hour of the day, pooled over all dates.
df_jam <- summarise(
  group_by(df_waktu, Jam),
  RataRata = mean(Room_Occupancy_Count),
  .groups  = "drop"
)

# Line and points over the hours, with a tick for every hour 0-23.
ggplot(df_jam, aes(x = Jam, y = RataRata)) +
  geom_line(color = "#2C7BB6", linewidth = 1.2) +
  geom_point(color = "#1F4E79", size = 2.5) +
  scale_x_continuous(breaks = 0:23) +
  labs(x = "Jam", y = "Rata-rata Jumlah Penghuni",
       title    = "Rata-rata Penghuni per Jam dalam Sehari",
       subtitle = "Puncak aktivitas terlihat pada jam kerja") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

# Mean occupancy count for each calendar date.
df_hari <- summarise(
  group_by(df_waktu, Tanggal),
  RataRata = mean(Room_Occupancy_Count),
  .groups  = "drop"
)

# Daily averages drawn as a line with a marker on every observed date.
ggplot(df_hari, aes(x = Tanggal, y = RataRata)) +
  geom_line(color = "#ED7D31", linewidth = 1) +
  geom_point(color = "#C0392B", size = 2) +
  labs(x = "Tanggal", y = "Rata-rata Jumlah Penghuni",
       title    = "Rata-rata Penghuni per Hari",
       subtitle = "Selama periode pengamatan (22 Des 2017 – 11 Jan 2018)") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

9. Deteksi Outlier

Boxplot untuk Setiap Sensor

# Columns to draw, in panel-definition order: temperatures, the two CO2
# columns, then light and sound for sensors S1-S4.
fitur_plot <- c(paste0("S", 1:4, "_Temp"),
                "S5_CO2", "S5_CO2_Slope",
                paste0("S", 1:4, "_Light"),
                paste0("S", 1:4, "_Sound"))

# Stack the selected columns into Fitur / Nilai pairs.
df_out <- pivot_longer(
  select(df, all_of(fitur_plot)),
  cols      = everything(),
  names_to  = "Fitur",
  values_to = "Nilai"
)

# One boxplot panel per feature with outliers in red. The x tick labels are
# blanked because the strip title already names the feature.
ggplot(df_out) +
  geom_boxplot(aes(x = Fitur, y = Nilai, fill = Fitur),
               outlier.color = "red", outlier.size = 0.8, alpha = 0.7) +
  facet_wrap(vars(Fitur), scales = "free", ncol = 7) +
  labs(title = "Deteksi Outlier — Boxplot per Sensor",
       x = NULL, y = "Nilai") +
  scale_fill_viridis_d(guide = "none") +
  theme_minimal(base_size = 9) +
  theme(plot.title  = element_text(face = "bold"),
        strip.text  = element_text(face = "bold"),
        axis.text.x = element_blank())

Jumlah Outlier per Fitur (Metode IQR)

# Count the values of `x` that fall outside the IQR fences
# [Q1 - k * IQR, Q3 + k * IQR].
#
# Arguments:
#   x  numeric vector; NA values are ignored.
#   k  fence multiplier; the default 1.5 is the usual boxplot rule.
#
# Returns a single integer: the number of non-NA values strictly below the
# lower fence or strictly above the upper fence.
#
# Note: when Q1 equals Q3 (IQR of zero) both fences collapse onto that value,
# so every value different from it is counted.
hitung_outlier <- function(x, k = 1.5) {
  # Both quartiles in one call; names = FALSE drops the "25%"/"75%" labels.
  kuartil <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  jangkauan   <- kuartil[2] - kuartil[1]
  batas_bawah <- kuartil[1] - k * jangkauan
  batas_atas  <- kuartil[2] + k * jangkauan
  sum(x < batas_bawah | x > batas_atas, na.rm = TRUE)
}

# IQR-rule outlier count for every predictor column, one row per feature,
# with the count's share of all rows, sorted from most to fewest outliers.
df_outlier <- local({
  prediktor <- select(df, -Date, -Time, -Room_Occupancy_Count)
  jumlah    <- summarise(prediktor, across(everything(), hitung_outlier))
  panjang   <- pivot_longer(jumlah, cols = everything(),
                            names_to = "Fitur", values_to = "JumlahOutlier")
  panjang   <- mutate(
    panjang,
    Persen = paste0(round(JumlahOutlier / nrow(df) * 100, 2), "%")
  )
  arrange(panjang, desc(JumlahOutlier))
})

print(df_outlier)

## # A tibble: 16 × 3
##    Fitur        JumlahOutlier Persen
##    <chr>                <int> <chr> 
##  1 S5_CO2_Slope          4033 39.82%
##  2 S1_Sound              1772 17.49%
##  3 S1_Light              1716 16.94%
##  4 S2_Sound              1703 16.81%
##  5 S3_Sound              1671 16.5% 
##  6 S5_CO2                1657 16.36%
##  7 S2_Light              1017 10.04%
##  8 S3_Light               950 9.38% 
##  9 S2_Temp                948 9.36% 
## 10 S6_PIR                 913 9.01% 
## 11 S7_PIR                 806 7.96% 
## 12 S4_Sound               721 7.12% 
## 13 S4_Light               543 5.36% 
## 14 S1_Temp                162 1.6%  
## 15 S3_Temp                  0 0%    
## 16 S4_Temp                  0 0%

10. Ringkasan EDA

cat("============================================================\n")

## ============================================================

cat(" RINGKASAN EKSPLORASI DATA\n")

##  RINGKASAN EKSPLORASI DATA

cat("============================================================\n")

## ============================================================

cat(sprintf("Total observasi          : %d\n", nrow(df)))

## Total observasi          : 10129

# Predictor count = all columns minus the three non-predictors (Date, Time
# and the target Room_Occupancy_Count).
cat(sprintf("Total fitur prediktor    : %d\n", ncol(df) - 3))  # subtract Date, Time, target

## Total fitur prediktor    : 16

cat(sprintf("Missing values           : %d\n", sum(is.na(df))))

## Missing values           : 0

cat(sprintf("Baris duplikat           : %d\n", sum(duplicated(df))))

## Baris duplikat           : 0

cat("\nDistribusi kelas:\n")

## 
## Distribusi kelas:

# Class distribution for the summary: the frequency table built in the
# target-distribution section, with its percentage column shown as "Persen".
print(setNames(df_dist, c("Kelas", "Frekuensi", "Persen")))

##   Kelas Frekuensi Persen
## 1     0      8228 81.23%
## 2     1       459  4.53%
## 3     2       748  7.38%
## 4     3       694  6.85%

cat("\nFitur dengan korelasi tertinggi ke target:\n")

## 
## Fitur dengan korelasi tertinggi ke target:

print(head(df_kor_target, 5))

## # A tibble: 5 × 2
##   Fitur    Korelasi
##   <chr>       <dbl>
## 1 S1_Light    0.849
## 2 S3_Light    0.793
## 3 S2_Light    0.789
## 4 S1_Temp     0.701
## 5 S7_PIR      0.695

cat("\nFitur dengan outlier terbanyak:\n")

## 
## Fitur dengan outlier terbanyak:

print(head(df_outlier, 5))

## # A tibble: 5 × 3
##   Fitur        JumlahOutlier Persen
##   <chr>                <int> <chr> 
## 1 S5_CO2_Slope          4033 39.82%
## 2 S1_Sound              1772 17.49%
## 3 S1_Light              1716 16.94%
## 4 S2_Sound              1703 16.81%
## 5 S3_Sound              1671 16.5%

cat("============================================================\n")

## ============================================================

File 1 — Eksplorasi Data (EDA)

Dataset: Room Occupancy Estimation

Nama Kelompok

2026-04-19

1. Muat Library

2. Load Dataset

Tampilan Awal Data

3. Statistik Deskriptif

Ringkasan Umum

Ringkasan Per-Variabel (Tabel Rapi)

4. Pengecekan Kualitas Data

Missing Values

Duplikat

Tipe Data

5. Distribusi Variabel Target

6. Distribusi Fitur Sensor

Histogram Semua Fitur

Boxplot Fitur per Kelas Penghuni

7. Analisis Korelasi

Matriks Korelasi (Heatmap)

Korelasi Fitur dengan Target

8. Analisis Temporal

Tren Penghuni Berdasarkan Waktu

9. Deteksi Outlier

Boxplot untuk Setiap Sensor

Jumlah Outlier per Fitur (Metode IQR)

10. Ringkasan EDA