knitr::opts_chunk$set(fig.width=15, fig.height=7)
Install Package
#install.packages("janitor")
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("lubridate")
#install.packages("tidyr")
Import Library
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
1. Filter & Synchronize the Data dan Data Cleaning
PREPROCESSING WIFI (filter dulu ke Library, lalu binning 10
menit)
# parse time (pakai UTC untuk konsistensi)
wifi$time <- ymd_hms(wifi$time, tz = "UTC")
wifi$Building <- trimws(wifi$Building)
# filter Library (case-insensitive)
wifi_library <- wifi %>% filter(tolower(Building) == "library")
cat("Dimensi Wifi.csv setelah filter:", dim(wifi_library), "\n")
## Dimensi Wifi.csv setelah filter: 28652 7
# buat kolom time yang dibulatkan (left-edge) ke interval 10 menit
# floor_date label='left'
wifi_10min <- wifi_library %>%
clean_names() %>% # ubah col names ke snake_case
mutate(time10 = floor_date(time, "10 minutes")) %>%
group_by(time10) %>%
summarise(wifi_clients = mean(associated_client_count, na.rm = TRUE), .groups = "drop") %>%
arrange(time10)
PREPROCESSING ENERGY
process_meter <- function(df) {
df <- df %>% mutate(ts = ymd_hms(ts, tz = "UTC")) %>% arrange(ts)
# imputasi NaN dengan rata-rata 144 observasi pertama (sesuai instruksi)
avg_first_144 <- mean(df$rate[1:144], na.rm = TRUE)
df$rate[is.na(df$rate)] <- avg_first_144
# binning 10 menit (floor_date) lalu rata-rata per bin
df10 <- df %>%
mutate(ts10 = floor_date(ts, "10 minutes")) %>%
group_by(ts10) %>%
summarise(rate = mean(rate, na.rm = TRUE), .groups = "drop") %>%
arrange(ts10)
return(df10)
}
lib1_10 <- process_meter(library1)
lib2_10 <- process_meter(library2)
lib3_10 <- process_meter(library3)
# jumlahkan ketiga meter per waktu (join via bind_rows lalu group_by sum)
energy_10min <- bind_rows(
lib1_10 %>% rename(time = ts10),
lib2_10 %>% rename(time = ts10),
lib3_10 %>% rename(time = ts10)
) %>%
group_by(time) %>%
summarise(energy_total = sum(rate, na.rm = TRUE), .groups = "drop") %>%
arrange(time)
SINKRONISASI (inner join)
data_sync <- inner_join(
wifi_10min %>% rename(time = time10),
energy_10min,
by = "time"
) %>% arrange(time)
cat("Dimensi setelah sinkronisasi:", dim(data_sync), "\n\n")
## Dimensi setelah sinkronisasi: 3683 3
# cek hasil
print(head(data_sync, 10))
## # A tibble: 10 × 3
## time wifi_clients energy_total
## <dttm> <dbl> <dbl>
## 1 2020-02-01 00:00:00 22.8 64.8
## 2 2020-02-01 00:10:00 20.6 97
## 3 2020-02-01 00:20:00 17.2 96
## 4 2020-02-01 00:30:00 15.4 100
## 5 2020-02-01 00:40:00 12.4 89
## 6 2020-02-01 00:50:00 10.5 91
## 7 2020-02-01 01:00:00 9.25 90
## 8 2020-02-01 01:10:00 8.67 92
## 9 2020-02-01 01:20:00 8.5 87
## 10 2020-02-01 01:30:00 8.38 83
2. Visualisasi Hubungan Hunian–Energi
A. TIME SERIES PLOTS
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = time)) +
geom_line(aes(y = wifi_clients, color = "Occupancy (WiFi Clients)")) +
geom_line(aes(y = energy_total, color = "Energy Consumption (kWh)")) +
labs(title = "Time Series: Occupancy vs Energy Consumption",
x = "Time", y = "Value") +
scale_color_manual(values = c("blue", "red")) +
theme_minimal()

B. SCATTER PLOT
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = wifi_clients, y = energy_total)) +
geom_point(alpha = 0.4, color = "darkgreen") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Scatter Plot: Occupancy vs Energy Consumption",
x = "Average Occupancy (10-min bin)",
y = "Total Energy Consumption (kWh)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

C. Daily Profile Occupancy (wifi)
# bikin kolom jam-menit sebagai POSIXct
data_daily <- data_sync %>%
mutate(date = as.Date(time),
hour_time = as.POSIXct(sprintf("%02d:%02d", hour(time), minute(time)),
format = "%H:%M", tz = "UTC"))
# plot daily profile occupancy dengan sumbu jam lebih rapi
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_daily, aes(x = hour_time, y = wifi_clients, group = date)) +
geom_line(alpha = 0.3, color = "blue") +
stat_summary(fun = mean, geom = "line", color = "darkblue", size = 1.2) +
labs(title = "Daily Profiles: Occupancy",
x = "Hour of Day", y = "WiFi Clients") +
scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

D. Daily Profile Energy
# plot semua hari (energy)
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_daily, aes(x = hour_time, y = energy_total, group = date)) +
geom_line(alpha = 0.3, color = "red") +
stat_summary(fun = mean, geom = "line", color = "darkred", size = 1.2) +
labs(title = "Daily Profiles: Energy Consumption",
x = "Hour of Day", y = "Energy (kWh)") +
scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
theme_minimal()

3. ANALYSIS
1. Peak Hours of Occupancy
# Tambahkan kolom hour dari timestamp
data_sync <- data_sync %>%
mutate(hour = hour(time)) # ambil jam 0-23 dari kolom time
# Hitung rata-rata per jam
peak_hours <- data_sync %>%
group_by(hour) %>%
summarise(avg_clients = mean(wifi_clients, na.rm = TRUE))
# Cari peak hour
peak_hour <- peak_hours$hour[which.max(peak_hours$avg_clients)]
cat(sprintf("Peak occupancy at hour: %02d:00 with avg %.2f clients\n",
peak_hour, max(peak_hours$avg_clients)))
## Peak occupancy at hour: 15:00 with avg 370.39 clients
# Plot bar chart 0-23
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(peak_hours, aes(x = factor(hour, levels=0:23), y = avg_clients)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_hline(yintercept = mean(peak_hours$avg_clients),
color = "red", linetype = "dashed") +
labs(title = "Average WiFi Clients per Hour of Day",
x = "Hour of Day",
y = "Average WiFi Clients") +
theme_minimal()

2. Influence of Occupancy on Energy
# Korelasi Pearson
corr_test <- cor.test(data_sync$wifi_clients, data_sync$energy_total, method = "pearson")
cat("=== Korelasi Occupancy vs Energy ===\n")
## === Korelasi Occupancy vs Energy ===
cat(sprintf("Pearson correlation = %.3f\n", corr_test$estimate))
## Pearson correlation = 0.878
# Regresi linear sederhana
model <- lm(energy_total ~ wifi_clients, data = data_sync)
summary(model) # menampilkan intercept, slope, R-squared, p-value
##
## Call:
## lm(formula = energy_total ~ wifi_clients, data = data_sync)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.299 -15.551 1.791 16.244 50.958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.037e+02 4.943e-01 209.8 <2e-16 ***
## wifi_clients 2.669e-01 2.396e-03 111.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.9 on 3681 degrees of freedom
## Multiple R-squared: 0.7713, Adjusted R-squared: 0.7712
## F-statistic: 1.241e+04 on 1 and 3681 DF, p-value: < 2.2e-16
# Cetak persamaan regresi secara manual
intercept <- coef(model)[1]
slope <- coef(model)[2]
cat(sprintf("Persamaan regresi: Energy = %.3f + %.3f * WiFi_clients\n", intercept, slope))
## Persamaan regresi: Energy = 103.696 + 0.267 * WiFi_clients
cat(sprintf("R-squared: %.3f\n", summary(model)$r.squared))
## R-squared: 0.771
cat(sprintf("Adj. R-squared: %.3f\n", summary(model)$adj.r.squared))
## Adj. R-squared: 0.771
cat(sprintf("P-value (wifi_clients): %.3g\n", summary(model)$coefficients[2,4]))
## P-value (wifi_clients): 0
# Scatter plot dengan regresi linear
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = wifi_clients, y = energy_total)) +
geom_point(alpha = 0.3, color = "blue") +
geom_smooth(method = "lm", color = "red") +
labs(title = "Occupancy vs Energy Consumption with Regression Line",
x = "WiFi Clients (Occupancy)",
y = "Energy Consumption (kWh)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

3. Highlight misaligned cases
# Prediksi energi berdasarkan WiFi (menggunakan model linear sebelumnya)
data_sync <- data_sync %>%
mutate(energy_pred = predict(model, newdata = data_sync))
# Hitung residual
data_sync <- data_sync %>%
mutate(residual = energy_total - energy_pred)
# Tentukan threshold untuk "besar"
threshold <- 2 * sd(data_sync$residual, na.rm = TRUE)
# Misaligned cases
mismatch <- data_sync %>% filter(abs(residual) > threshold)
high_energy_unexpected <- data_sync %>% filter(residual > threshold) # energi jauh lebih tinggi
low_energy_unexpected <- data_sync %>% filter(residual < -threshold) # energi jauh lebih rendah
cat("Kasus energi terlalu tinggi (WiFi rendah):", nrow(high_energy_unexpected), "\n")
## Kasus energi terlalu tinggi (WiFi rendah): 40
cat("Kasus energi terlalu rendah (WiFi tinggi):", nrow(low_energy_unexpected), "\n")
## Kasus energi terlalu rendah (WiFi tinggi): 71
cat("Jumlah kasus mismatch:", nrow(mismatch), "\n")
## Jumlah kasus mismatch: 111
# Tampilkan beberapa kasus mismatch
head(mismatch)
# --- Visualisasi Time Series ---
# Pivot data untuk plot line
ts_plot <- data_sync %>%
select(time, wifi_clients, energy_total, energy_pred) %>%
pivot_longer(cols = c(wifi_clients, energy_total, energy_pred),
names_to = "variable", values_to = "value")
#windows(width=18, height=6) -> Tampilkan juga di window terpisah
ggplot(ts_plot, aes(x = time, y = value, color = variable, linetype = variable)) +
geom_line(size = 1) +
# Titik merah untuk high_energy_unexpected
geom_point(data = high_energy_unexpected, aes(x = time, y = energy_total, shape = "High Unexpected"),
color = "red", size = 2, inherit.aes = FALSE) +
# Titik ungu untuk low_energy_unexpected
geom_point(data = low_energy_unexpected, aes(x = time, y = energy_total, shape = "Low Unexpected"),
color = "purple", size = 2, inherit.aes = FALSE) +
scale_color_manual(values = c("blue", "orange", "green")) +
scale_linetype_manual(values = c("solid", "solid", "dashed")) +
scale_shape_manual(name = "Unexpected Cases",
values = c("High Unexpected" = 16, "Low Unexpected" = 16)) +
labs(title = "Time Series with Misaligned Cases (Based on Regression Residuals)",
x = "Time", y = "WiFi Clients / Energy", color = "Variable", linetype = "Variable") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%b %d") + # tampil per hari
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# --- Scatter Plot dengan highlight mismatch ---
#windows(width=18, height=6) #-> Tampilkan juga di window terpisah
ggplot(data_sync, aes(x = wifi_clients, y = energy_total)) +
geom_point(alpha = 0.5, color = "blue") +
geom_point(data = mismatch, aes(x = wifi_clients, y = energy_total),
color = "red", size = 2) +
labs(title = "Occupancy vs Energy with Mismatches Highlighted",
x = "WiFi Clients (Occupancy)",
y = "Energy Consumption (kWh)") +
theme_minimal()

4. Perbandingan Akhir Pekan vs Hari Kerja
# Tambahkan kolom dayofweek (0 = Senin, 6 = Minggu)
data_sync <- data_sync %>%
mutate(dayofweek = wday(time, week_start = 1) - 1) # 0=Mon ... 6=Sun
# Pisahkan weekday (0-4) dan weekend (5-6)
weekday_data <- data_sync %>% filter(dayofweek < 5)
weekend_data <- data_sync %>% filter(dayofweek >= 5)
# --- Plot WiFi Clients ---
#windows(width=18, height=6)
ggplot() +
geom_line(data = weekday_data, aes(x = time, y = wifi_clients, color = "Weekday WiFi"), alpha = 0.75) +
geom_line(data = weekend_data, aes(x = time, y = wifi_clients, color = "Weekend WiFi"), alpha = 0.75) +
labs(title = "WiFi Clients: Weekday vs Weekend",
x = "Time", y = "Clients", color = "Legend") +
theme_minimal()

# --- Plot Energy ---
#windows(width=18, height=6)
ggplot() +
geom_line(data = weekday_data, aes(x = time, y = energy_total, color = "Weekday Energy"), alpha = 0.75) +
geom_line(data = weekend_data, aes(x = time, y = energy_total, color = "Weekend Energy"), alpha = 0.75) +
labs(title = "Energy Consumption: Weekday vs Weekend",
x = "Time", y = "Energy (rate)", color = "Legend") +
theme_minimal()

# Tambahkan kolom hour
data_sync <- data_sync %>%
mutate(hour = hour(time))
# Pisahkan weekday & weekend
weekday_data <- data_sync %>% filter(dayofweek < 5)
weekend_data <- data_sync %>% filter(dayofweek >= 5)
# --- WiFi Clients ---
weekday_wifi <- weekday_data %>%
group_by(hour) %>%
summarise(avg_wifi = mean(wifi_clients, na.rm = TRUE))
weekend_wifi <- weekend_data %>%
group_by(hour) %>%
summarise(avg_wifi = mean(wifi_clients, na.rm = TRUE))
#windows(width=18, height=6)
ggplot() +
geom_line(data = weekday_wifi, aes(x = hour, y = avg_wifi, color = "Average WiFi (Weekday)"), size = 1) +
geom_line(data = weekend_wifi, aes(x = hour, y = avg_wifi, color = "Average WiFi (Weekend)"), size = 1) +
labs(title = "Daily WiFi Profile: Weekday vs Weekend",
x = "Hour of Day", y = "Average Clients", color = "Legend") +
theme_minimal() +
theme(panel.grid.minor = element_blank())

# --- Energy Consumption ---
weekday_energy <- weekday_data %>%
group_by(hour) %>%
summarise(avg_energy = mean(energy_total, na.rm = TRUE))
weekend_energy <- weekend_data %>%
group_by(hour) %>%
summarise(avg_energy = mean(energy_total, na.rm = TRUE))
#windows(width=18, height=6)
ggplot() +
geom_line(data = weekday_energy, aes(x = hour, y = avg_energy, color = "Average Energy (Weekday)"), size = 1) +
geom_line(data = weekend_energy, aes(x = hour, y = avg_energy, color = "Average Energy (Weekend)"), size = 1) +
labs(title = "Daily Energy Profile: Weekday vs Weekend",
x = "Hour of Day", y = "Average Energy (rate)", color = "Legend") +
theme_minimal() +
theme(panel.grid.minor = element_blank())
