Data Integration & Cleaning

# WiFi Dataset
wifi <- read.csv("C:/Users/L E N O V O/Downloads/wifi.csv")
wifi$time <- ymd_hms(wifi$time)

wifi <- wifi %>%
  filter(Building == " Library ") %>%
  select(time, Associated.Client.Count) %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(mean_client_count = mean(Associated.Client.Count, na.rm = TRUE), .groups="drop")

# Electricity Dataset
lib1 <- read.csv("C:/Users/L E N O V O/Downloads/library1.csv")
lib2 <- read.csv("C:/Users/L E N O V O/Downloads/library2.csv")
lib3 <- read.csv("C:/Users/L E N O V O/Downloads/library3.csv")

lib1$ts <- ymd_hms(lib1$ts)
lib2$ts <- ymd_hms(lib2$ts)
lib3$ts <- ymd_hms(lib3$ts)

fill_na <- function(x) {
  x[is.na(x)] <- mean(head(x, 144), na.rm = TRUE)
  return(x)
}
lib1$rate <- fill_na(lib1$rate)
lib2$rate <- fill_na(lib2$rate)
lib3$rate <- fill_na(lib3$rate)

electricity <- lib1 %>%
  rename(rate_1 = rate) %>%
  full_join(rename(lib2, rate_2 = rate), by = "ts") %>%
  full_join(rename(lib3, rate_3 = rate), by = "ts") %>%
  mutate(total_electricity = rate_1 + rate_2 + rate_3,
         time = ts)

df <- inner_join(wifi, electricity %>% select(time, total_electricity), by = "time")

Time Series Occupancy dan Electricity

ggplot(df, aes(x = time)) +
  geom_line(aes(y = mean_client_count, color = "Clients")) +
  geom_line(aes(y = total_electricity, color = "Electricity")) +
  labs(x = "Time", y = "Value", title = "Time Series of Occupancy and Electricity") +
  scale_color_manual(values = c("Clients" = "blue", "Electricity" = "red"))

Grafik pertama memperlihatkan pola temporal antara jumlah pengguna WiFi (sebagai proksi occupancy) dan konsumsi listrik di perpustakaan. Terlihat bahwa keduanya menunjukkan siklus harian yang serupa, dengan puncak pada siang hingga sore hari, lalu menurun pada malam hari. Hal ini konsisten dengan perilaku penggunaan gedung, di mana semakin banyak pengguna, konsumsi listrik pun meningkat.

Korelasi Occupancy dan Electricity

ggplot(df, aes(x = mean_client_count, y = total_electricity)) +
  geom_point(alpha = 0.6, color="black") +
  labs(x = "WiFi Clients", y = "Electricity", title = "Occupancy vs Electricity")

Scatter plot menunjukkan adanya hubungan positif yang kuat antara jumlah pengguna WiFi dengan konsumsi energi.
cor_test <- cor.test(df$mean_client_count, df$total_electricity, use="complete.obs")
cor_test
## 
##  Pearson's product-moment correlation
## 
## data:  df$mean_client_count and df$total_electricity
## t = 111.41, df = 3681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8706081 0.8853959
## sample estimates:
##       cor 
## 0.8782117
Nilai korelasi Pearson yang diperoleh sebesar 0.878 (p < 0.001), yang berarti hubungan keduanya sangat kuat dan signifikan.
ggplot(df, aes(x = mean_client_count, y = total_electricity)) +
  geom_point(alpha=0.5, color="steelblue") +
  geom_smooth(method="lm", se=FALSE, color="red", linetype="dashed") +
  labs(title="Occupancy vs Energy Consumption",
       x="WiFi Clients (Occupancy)", y="Total Electricity")
## `geom_smooth()` using formula = 'y ~ x'

Garis regresi linier menegaskan bahwa peningkatan jumlah pengguna cenderung diikuti kenaikan konsumsi energi.

Daily Profile

daily_profile <- df %>%
  mutate(date = as.Date(time),
         hour_minute = format(time, "%H:%M")) %>%
  group_by(date, hour_minute) %>%
  summarise(mean_clients = mean(mean_client_count, na.rm=TRUE),
            total_energy = mean(total_electricity, na.rm=TRUE), .groups="drop")

avg_daily_profile <- daily_profile %>%
  group_by(hour_minute) %>%
  summarise(mean_clients = mean(mean_clients, na.rm=TRUE),
            total_energy = mean(total_energy, na.rm=TRUE), .groups="drop")
ggplot(daily_profile, aes(x = as.POSIXct(hour_minute, format="%H:%M"), 
                          y = mean_clients, group = date)) +
  geom_line(color="grey70", alpha=0.6) +
  geom_line(data = avg_daily_profile, aes(x = as.POSIXct(hour_minute, format="%H:%M"),
                                          y = mean_clients, group=1), 
            color="blue", size=1.2) +
  labs(title="Daily Profile - Occupancy (WiFi)", x="Time of Day", y="Avg Client Count") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Dari rata-rata harian terlihat bahwa jumlah pengguna mulai meningkat sekitar pukul 08.00, mencapai puncaknya sekitar pukul 15.00, lalu menurun perlahan hingga malam hari. Jam puncak (peak hour) yang paling sering terjadi adalah pukul 15.00.
ggplot(daily_profile, aes(x = as.POSIXct(hour_minute, format="%H:%M"), 
                          y = total_energy, group = date)) +
  geom_line(color="grey70", alpha=0.6) +
  geom_line(data = avg_daily_profile, aes(x = as.POSIXct(hour_minute, format="%H:%M"),
                                          y = total_energy, group=1), 
            color="red", size=1.2) +
  labs(title="Daily Profile - Energy Consumption", x="Time of Day", y="Energy") +
  theme_minimal()

Pola konsumsi energi juga meningkat seiring dengan jumlah pengguna, dengan kenaikan tajam pada pagi hingga siang hari dan bertahan tinggi hingga sore. Malam hari konsumsi energi menurun kembali, sejalan dengan penurunan aktivitas pengguna.

Analisis Peak Hour & Mismatch

peak_hour <- df %>%
  group_by(date = as.Date(time), hour = hour(time)) %>%
  summarise(mean_clients = mean(mean_client_count, na.rm=TRUE), .groups="drop") %>%
  slice_max(mean_clients, n=1, with_ties = FALSE)

avg_peak_hour <- peak_hour %>%
  count(hour) %>%
  slice_max(n, n=1)

cat("Peak hour terbanyak terjadi pada jam:", avg_peak_hour$hour, "\n")
## Peak hour terbanyak terjadi pada jam: 15
Jam puncak (peak hour) terbanyak terjadi pada pukul 15.

Analisis Mismatch (Case Energi vs Occupancy Tidak Sinkron)

low_occ <- quantile(df$mean_client_count, 0.25, na.rm=TRUE)
high_energy <- quantile(df$total_electricity, 0.90, na.rm=TRUE)
high_occ <- quantile(df$mean_client_count, 0.90, na.rm=TRUE)
low_energy <- quantile(df$total_electricity, 0.25, na.rm=TRUE)

mismatch_highE_lowO <- df %>%
  filter(mean_client_count <= low_occ & total_electricity >= high_energy)

mismatch_highO_lowE <- df %>%
  filter(mean_client_count >= high_occ & total_electricity <= low_energy)

cat("Jumlah kasus Energi Tinggi - Occupancy Rendah:", nrow(mismatch_highE_lowO), "\n")
## Jumlah kasus Energi Tinggi - Occupancy Rendah: 0
cat("Jumlah kasus Energi Rendah - Occupancy Tinggi:", nrow(mismatch_highO_lowE), "\n")
## Jumlah kasus Energi Rendah - Occupancy Tinggi: 0
Tidak ditemukan kasus mismatch (energi tinggi tapi okupansi rendah, atau sebaliknya).

Weekday VS Weekend

df <- df %>%
  mutate(weekday = wday(time, label = TRUE),
         weekend = ifelse(weekday %in% c("Sat", "Sun"), "Weekend", "Weekday"))
ggplot(df, aes(x = time, y = mean_client_count, color = weekend)) +
  geom_line() +
  facet_wrap(~weekend, scales="free_x", ncol=1) +
  labs(title="Occupancy - Weekday vs Weekend", x="Time", y="Client Count") +
  theme_minimal()

Pada hari kerja (weekday), jumlah pengguna jauh lebih tinggi dan konsisten, dengan pola harian yang jelas. Sedangkan pada akhir pekan (weekend), jumlah pengguna menurun drastis, hanya terlihat lonjakan kecil pada jam-jam tertentu.
ggplot(df, aes(x = time, y = total_electricity, color = weekend)) +
  geom_line() +
  facet_wrap(~weekend, scales="free_x", ncol=1) +
  labs(title="Energy Consumption - Weekday vs Weekend", x="Time", y="Energy") +
  theme_minimal()

Sama halnya, konsumsi listrik di weekday jauh lebih tinggi dibanding weekend. Meski begitu, pola kenaikan siang hari dan penurunan malam hari tetap terlihat pada kedua kategori.
ggplot(df, aes(x=mean_client_count, y=total_electricity, color=weekend)) +
  geom_point(alpha=0.5) +
  labs(title="Occupancy vs Energy - Weekday vs Weekend", 
       x="WiFi Clients", y="Energy Consumption")

Scatter plot weekday vs weekend memperlihatkan pola hubungan serupa (keduanya tetap berkorelasi positif). Namun, pada weekday, kepadatan titik lebih tinggi di area okupansi besar, sedangkan pada weekend lebih tersebar di okupansi rendah hingga menengah.

Daily Profile dengan Weekend vs Weekday

daily_profile_wd <- df %>%
  mutate(date = as.Date(time),
         hour_minute = format(time, "%H:%M"),
         day_type = ifelse(weekdays(date) %in% c("Saturday", "Sunday"), "Weekend", "Weekday")) %>%
  group_by(day_type, date, hour_minute) %>%
  summarise(mean_clients = mean(mean_client_count, na.rm=TRUE),
            total_energy = mean(total_electricity, na.rm=TRUE), .groups="drop")

avg_daily_profile_wd <- daily_profile_wd %>%
  group_by(day_type, hour_minute) %>%
  summarise(mean_clients = mean(mean_clients, na.rm=TRUE),
            total_energy = mean(total_energy, na.rm=TRUE), .groups="drop")

# Plot Occupancy Daily Profile (Weekday vs Weekend)
ggplot(daily_profile_wd, aes(x = as.POSIXct(hour_minute, format="%H:%M"), 
                             y = mean_clients, group = date, color=day_type)) +
  geom_line(alpha=0.4) +
  geom_line(data = avg_daily_profile_wd, 
            aes(x = as.POSIXct(hour_minute, format="%H:%M"),
                y = mean_clients, color=day_type, group=day_type),
            size=1.2) +
  labs(title="Daily Profile - Occupancy (Weekday vs Weekend)", 
       x="Time of Day", y="Avg Client Count") +
  theme_minimal()

Rata-rata jumlah orang di gedung per jam dalam sehari menunjukkan bahwa aktivitas utama terjadi pada hari kerja, sementara akhir pekan lebih tenang dan santai.