knitr::opts_chunk$set(fig.width=15, fig.height=7)

Install Package

#install.packages("janitor")
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("lubridate")
#install.packages("tidyr")

Import Library

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(scales)
## Warning: package 'scales' was built under R version 4.4.3

Input Dataset

#================================ Input Dataset ================================
wifi <- read.csv("wifi.csv", stringsAsFactors = FALSE)
library1 <- read.csv("library1.csv", stringsAsFactors = FALSE)
library2 <- read.csv("library2.csv", stringsAsFactors = FALSE)
library3 <- read.csv("library3.csv", stringsAsFactors = FALSE)

#============================== Melihat Dimensi Dataset ========================
cat("Dimensi Wifi.csv = ", dim(wifi), "\n")
## Dimensi Wifi.csv =  1883844 7
cat("Dimensi library1.csv = ", dim(library1), "\n")
## Dimensi library1.csv =  18864 6
cat("Dimensi library2.csv = ", dim(library2), "\n")
## Dimensi library2.csv =  18864 6
cat("Dimensi library3.csv = ", dim(library3), "\n")
## Dimensi library3.csv =  18864 6

1. Filter & Synchronize the Data dan Data Cleaning

PREPROCESSING WIFI (filter dulu ke Library, lalu binning 10 menit)

# parse time (pakai UTC untuk konsistensi)
wifi$time <- ymd_hms(wifi$time, tz = "UTC")
wifi$Building <- trimws(wifi$Building)

# filter Library (case-insensitive)
wifi_library <- wifi %>% filter(tolower(Building) == "library")
cat("Dimensi Wifi.csv setelah filter:", dim(wifi_library), "\n")
## Dimensi Wifi.csv setelah filter: 28652 7
# buat kolom time yang dibulatkan (left-edge) ke interval 10 menit
# floor_date label='left'
wifi_10min <- wifi_library %>%
  clean_names() %>%                          # ubah col names ke snake_case
  mutate(time10 = floor_date(time, "10 minutes")) %>%
  group_by(time10) %>%
  summarise(wifi_clients = mean(associated_client_count, na.rm = TRUE), .groups = "drop") %>%
  arrange(time10)

PREPROCESSING ENERGY

process_meter <- function(df) {
  df <- df %>% mutate(ts = ymd_hms(ts, tz = "UTC")) %>% arrange(ts)
  # imputasi NaN dengan rata-rata 144 observasi pertama (sesuai instruksi)
  avg_first_144 <- mean(df$rate[1:144], na.rm = TRUE)
  df$rate[is.na(df$rate)] <- avg_first_144
  # binning 10 menit (floor_date) lalu rata-rata per bin
  df10 <- df %>%
    mutate(ts10 = floor_date(ts, "10 minutes")) %>%
    group_by(ts10) %>%
    summarise(rate = mean(rate, na.rm = TRUE), .groups = "drop") %>%
    arrange(ts10)
  return(df10)
}

lib1_10 <- process_meter(library1)
lib2_10 <- process_meter(library2)
lib3_10 <- process_meter(library3)

# jumlahkan ketiga meter per waktu (join via bind_rows lalu group_by sum)
energy_10min <- bind_rows(
  lib1_10 %>% rename(time = ts10),
  lib2_10 %>% rename(time = ts10),
  lib3_10 %>% rename(time = ts10)
) %>%
  group_by(time) %>%
  summarise(energy_total = sum(rate, na.rm = TRUE), .groups = "drop") %>%
  arrange(time)

SINKRONISASI (inner join)

data_sync <- inner_join(
  wifi_10min %>% rename(time = time10),
  energy_10min,
  by = "time"
) %>% arrange(time)

cat("Dimensi setelah sinkronisasi:", dim(data_sync), "\n\n")
## Dimensi setelah sinkronisasi: 3683 3
# cek hasil
print(head(data_sync, 10))
## # A tibble: 10 × 3
##    time                wifi_clients energy_total
##    <dttm>                     <dbl>        <dbl>
##  1 2020-02-01 00:00:00        22.8          64.8
##  2 2020-02-01 00:10:00        20.6          97  
##  3 2020-02-01 00:20:00        17.2          96  
##  4 2020-02-01 00:30:00        15.4         100  
##  5 2020-02-01 00:40:00        12.4          89  
##  6 2020-02-01 00:50:00        10.5          91  
##  7 2020-02-01 01:00:00         9.25         90  
##  8 2020-02-01 01:10:00         8.67         92  
##  9 2020-02-01 01:20:00         8.5          87  
## 10 2020-02-01 01:30:00         8.38         83

2. Visualisasi Hubungan Hunian–Energi

A. TIME SERIES PLOTS

#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = time)) +
  geom_line(aes(y = wifi_clients, color = "Occupancy (WiFi Clients)")) +
  geom_line(aes(y = energy_total, color = "Energy Consumption (kWh)")) +
  labs(title = "Time Series: Occupancy vs Energy Consumption",
       x = "Time", y = "Value") +
  scale_color_manual(values = c("blue", "red")) +
  theme_minimal()

B. SCATTER PLOT

#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = wifi_clients, y = energy_total)) +
  geom_point(alpha = 0.4, color = "darkgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Scatter Plot: Occupancy vs Energy Consumption",
       x = "Average Occupancy (10-min bin)",
       y = "Total Energy Consumption (kWh)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

C. Daily Profile Occupancy (wifi)

# bikin kolom jam-menit sebagai POSIXct
data_daily <- data_sync %>%
  mutate(date = as.Date(time),
         hour_time = as.POSIXct(sprintf("%02d:%02d", hour(time), minute(time)),
                                format = "%H:%M", tz = "UTC"))

# plot daily profile occupancy dengan sumbu jam lebih rapi
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_daily, aes(x = hour_time, y = wifi_clients, group = date)) +
  geom_line(alpha = 0.3, color = "blue") +
  stat_summary(fun = mean, geom = "line", color = "darkblue", size = 1.2) +
  labs(title = "Daily Profiles: Occupancy",
       x = "Hour of Day", y = "WiFi Clients") +
  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

D. Daily Profile Energy

# plot semua hari (energy)
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_daily, aes(x = hour_time, y = energy_total, group = date)) +
  geom_line(alpha = 0.3, color = "red") +
  stat_summary(fun = mean, geom = "line", color = "darkred", size = 1.2) +
  labs(title = "Daily Profiles: Energy Consumption",
       x = "Hour of Day", y = "Energy (kWh)") +
  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
  theme_minimal()

3. ANALYSIS

1. Peak Hours of Occupancy

# Tambahkan kolom hour dari timestamp
data_sync <- data_sync %>%
  mutate(hour = hour(time))  # ambil jam 0-23 dari kolom time

# Hitung rata-rata per jam
peak_hours <- data_sync %>%
  group_by(hour) %>%
  summarise(avg_clients = mean(wifi_clients, na.rm = TRUE))

# Cari peak hour
peak_hour <- peak_hours$hour[which.max(peak_hours$avg_clients)]
cat(sprintf("Peak occupancy at hour: %02d:00 with avg %.2f clients\n",
            peak_hour, max(peak_hours$avg_clients)))
## Peak occupancy at hour: 15:00 with avg 370.39 clients
# Plot bar chart 0-23
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(peak_hours, aes(x = factor(hour, levels=0:23), y = avg_clients)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_hline(yintercept = mean(peak_hours$avg_clients),
             color = "red", linetype = "dashed") +
  labs(title = "Average WiFi Clients per Hour of Day",
       x = "Hour of Day",
       y = "Average WiFi Clients") +
  theme_minimal()

2. Influence of Occupancy on Energy

# Korelasi Pearson
corr_test <- cor.test(data_sync$wifi_clients, data_sync$energy_total, method = "pearson")
cat("=== Korelasi Occupancy vs Energy ===\n")
## === Korelasi Occupancy vs Energy ===
cat(sprintf("Pearson correlation = %.3f\n", corr_test$estimate))
## Pearson correlation = 0.878
# Regresi linear sederhana
model <- lm(energy_total ~ wifi_clients, data = data_sync)
summary(model)  # menampilkan intercept, slope, R-squared, p-value
## 
## Call:
## lm(formula = energy_total ~ wifi_clients, data = data_sync)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.299 -15.551   1.791  16.244  50.958 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.037e+02  4.943e-01   209.8   <2e-16 ***
## wifi_clients 2.669e-01  2.396e-03   111.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.9 on 3681 degrees of freedom
## Multiple R-squared:  0.7713, Adjusted R-squared:  0.7712 
## F-statistic: 1.241e+04 on 1 and 3681 DF,  p-value: < 2.2e-16
# Cetak persamaan regresi secara manual
intercept <- coef(model)[1]
slope <- coef(model)[2]
cat(sprintf("Persamaan regresi: Energy = %.3f + %.3f * WiFi_clients\n", intercept, slope))
## Persamaan regresi: Energy = 103.696 + 0.267 * WiFi_clients
cat(sprintf("R-squared: %.3f\n", summary(model)$r.squared))
## R-squared: 0.771
cat(sprintf("Adj. R-squared: %.3f\n", summary(model)$adj.r.squared))
## Adj. R-squared: 0.771
cat(sprintf("P-value (wifi_clients): %.3g\n", summary(model)$coefficients[2,4]))
## P-value (wifi_clients): 0
# Scatter plot dengan regresi linear
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = wifi_clients, y = energy_total)) +
  geom_point(alpha = 0.3, color = "blue") +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Occupancy vs Energy Consumption with Regression Line",
       x = "WiFi Clients (Occupancy)",
       y = "Energy Consumption (kWh)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

3. Highlight misaligned cases

# Prediksi energi berdasarkan WiFi (menggunakan model linear sebelumnya)
data_sync <- data_sync %>%
  mutate(energy_pred = predict(model, newdata = data_sync))

# Hitung residual
data_sync <- data_sync %>%
  mutate(residual = energy_total - energy_pred)

# Tentukan threshold untuk "besar"
threshold <- 2 * sd(data_sync$residual, na.rm = TRUE)

# Misaligned cases
mismatch <- data_sync %>% filter(abs(residual) > threshold)
high_energy_unexpected <- data_sync %>% filter(residual > threshold)   # energi jauh lebih tinggi
low_energy_unexpected  <- data_sync %>% filter(residual < -threshold)  # energi jauh lebih rendah

cat("Kasus energi terlalu tinggi (WiFi rendah):", nrow(high_energy_unexpected), "\n")
## Kasus energi terlalu tinggi (WiFi rendah): 40
cat("Kasus energi terlalu rendah (WiFi tinggi):", nrow(low_energy_unexpected), "\n")
## Kasus energi terlalu rendah (WiFi tinggi): 71
cat("Jumlah kasus mismatch:", nrow(mismatch), "\n")
## Jumlah kasus mismatch: 111
# Tampilkan beberapa kasus mismatch
head(mismatch)
# --- Visualisasi Time Series ---
# Pivot data untuk plot line
ts_plot <- data_sync %>%
  select(time, wifi_clients, energy_total, energy_pred) %>%
  pivot_longer(cols = c(wifi_clients, energy_total, energy_pred),
               names_to = "variable", values_to = "value")

#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(ts_plot, aes(x = time, y = value, color = variable, linetype = variable)) +
  geom_line(size = 1) +
  # Titik merah untuk high_energy_unexpected
  geom_point(data = high_energy_unexpected, aes(x = time, y = energy_total, shape = "High Unexpected"),
             color = "red", size = 2, inherit.aes = FALSE) +
  # Titik ungu untuk low_energy_unexpected
  geom_point(data = low_energy_unexpected, aes(x = time, y = energy_total, shape = "Low Unexpected"),
             color = "purple", size = 2, inherit.aes = FALSE) +
  scale_color_manual(values = c("blue", "orange", "green")) +
  scale_linetype_manual(values = c("solid", "solid", "dashed")) +
  scale_shape_manual(name = "Unexpected Cases",
                     values = c("High Unexpected" = 16, "Low Unexpected" = 16)) +
  labs(title = "Time Series with Misaligned Cases (Based on Regression Residuals)",
       x = "Time", y = "WiFi Clients / Energy", color = "Variable", linetype = "Variable") +
  scale_x_datetime(date_breaks = "1 day", date_labels = "%b %d") +  # tampil per hari
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# --- Scatter Plot dengan highlight mismatch ---
#windows(width=18, height=6) #-> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = wifi_clients, y = energy_total)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_point(data = mismatch, aes(x = wifi_clients, y = energy_total),
             color = "red", size = 2) +
  labs(title = "Occupancy vs Energy with Mismatches Highlighted",
       x = "WiFi Clients (Occupancy)",
       y = "Energy Consumption (kWh)") +
  theme_minimal()

4. Perbandingan Akhir Pekan vs Hari Kerja

# Tambahkan kolom dayofweek (0 = Senin, 6 = Minggu)
data_sync <- data_sync %>%
  mutate(dayofweek = wday(time, week_start = 1) - 1)  # 0=Mon ... 6=Sun

# Pisahkan weekday (0-4) dan weekend (5-6)
weekday_data <- data_sync %>% filter(dayofweek < 5)
weekend_data <- data_sync %>% filter(dayofweek >= 5)

# --- Plot WiFi Clients ---
#windows(width=18, height=6)
ggplot() +
  geom_line(data = weekday_data, aes(x = time, y = wifi_clients, color = "Weekday WiFi"), alpha = 0.75) +
  geom_line(data = weekend_data, aes(x = time, y = wifi_clients, color = "Weekend WiFi"), alpha = 0.75) +
  labs(title = "WiFi Clients: Weekday vs Weekend",
       x = "Time", y = "Clients", color = "Legend") +
  theme_minimal()

# --- Plot Energy ---
#windows(width=18, height=6)
ggplot() +
  geom_line(data = weekday_data, aes(x = time, y = energy_total, color = "Weekday Energy"), alpha = 0.75) +
  geom_line(data = weekend_data, aes(x = time, y = energy_total, color = "Weekend Energy"), alpha = 0.75) +
  labs(title = "Energy Consumption: Weekday vs Weekend",
       x = "Time", y = "Energy (rate)", color = "Legend") +
  theme_minimal()

# Tambahkan kolom hour
data_sync <- data_sync %>%
  mutate(hour = hour(time))

# Pisahkan weekday & weekend 
weekday_data <- data_sync %>% filter(dayofweek < 5)
weekend_data <- data_sync %>% filter(dayofweek >= 5)

# --- WiFi Clients ---
weekday_wifi <- weekday_data %>%
  group_by(hour) %>%
  summarise(avg_wifi = mean(wifi_clients, na.rm = TRUE))

weekend_wifi <- weekend_data %>%
  group_by(hour) %>%
  summarise(avg_wifi = mean(wifi_clients, na.rm = TRUE))

#windows(width=18, height=6)
ggplot() +
  geom_line(data = weekday_wifi, aes(x = hour, y = avg_wifi, color = "Average WiFi (Weekday)"), size = 1) +
  geom_line(data = weekend_wifi, aes(x = hour, y = avg_wifi, color = "Average WiFi (Weekend)"), size = 1) +
  labs(title = "Daily WiFi Profile: Weekday vs Weekend",
       x = "Hour of Day", y = "Average Clients", color = "Legend") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())

# --- Energy Consumption ---
weekday_energy <- weekday_data %>%
  group_by(hour) %>%
  summarise(avg_energy = mean(energy_total, na.rm = TRUE))

weekend_energy <- weekend_data %>%
  group_by(hour) %>%
  summarise(avg_energy = mean(energy_total, na.rm = TRUE))
  
#windows(width=18, height=6)
ggplot() +
  geom_line(data = weekday_energy, aes(x = hour, y = avg_energy, color = "Average Energy (Weekday)"), size = 1) +
  geom_line(data = weekend_energy, aes(x = hour, y = avg_energy, color = "Average Energy (Weekend)"), size = 1) +
  labs(title = "Daily Energy Profile: Weekday vs Weekend",
       x = "Hour of Day", y = "Average Energy (rate)", color = "Legend") +
  theme_minimal() +
  theme(panel.grid.minor = element_blank())