–>
library(tidyverse)
library(lubridate)
library(zoo)
library(broom)
library(scales)
# Wifi
wifi <- read_csv("C:/Users/ASUS/Downloads/wifi.csv")
# Preview
head(wifi)
## # A tibble: 6 × 7
## time `Event Time` Associated Client Co…¹ Authenticated Client…²
## <dttm> <chr> <dbl> <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 … 184 182
## 2 2020-02-01 00:02:12 Sat Feb 01 … 6 6
## 3 2020-02-01 00:02:12 Sat Feb 01 … 18 18
## 4 2020-02-01 00:02:12 Sat Feb 01 … 23 23
## 5 2020-02-01 00:02:12 Sat Feb 01 … 45 45
## 6 2020-02-01 00:02:12 Sat Feb 01 … 16 16
## # ℹ abbreviated names: ¹​`Associated Client Count`,
## # ²​`Authenticated Client Count`
## # ℹ 3 more variables: Uni <chr>, Building <chr>, Floor <chr>
str(wifi)
## spc_tbl_ [1,883,844 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ time : POSIXct[1:1883844], format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Event Time : chr [1:1883844] "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
## $ Associated Client Count : num [1:1883844] 184 6 18 23 45 16 32 6 73 5 ...
## $ Authenticated Client Count: num [1:1883844] 182 6 18 23 45 16 32 6 73 5 ...
## $ Uni : chr [1:1883844] "Lancaster University" "Lancaster University" "Lancaster University" "Lancaster University" ...
## $ Building : chr [1:1883844] "Graduate College" "Management School" "SW hse 32-33" "SW hse 29" ...
## $ Floor : chr [1:1883844] "A Floor" "C Floor" "D Floor" "B floor" ...
## - attr(*, "spec")=
## .. cols(
## .. time = col_datetime(format = ""),
## .. `Event Time` = col_character(),
## .. `Associated Client Count` = col_double(),
## .. `Authenticated Client Count` = col_double(),
## .. Uni = col_character(),
## .. Building = col_character(),
## .. Floor = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Library energy datasets
lib1 <- read_csv("C:/Users/ASUS/Downloads/library1.csv")
lib2 <- read_csv("C:/Users/ASUS/Downloads/library2.csv")
lib3 <- read_csv("C:/Users/Asus/Downloads/library3.csv")
head(lib1); head(lib2); head(lib3)
## # A tibble: 6 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8
## # A tibble: 6 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh 2129016 NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh 2129034 18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh 2129054 20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh 2129071 17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh 2129086 15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh 2129103 17
## # A tibble: 6 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh 6914209 NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh 6914266 57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh 6914310 44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh 6914376 66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh 6914439 63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh 6914474 35
str(lib1)
## spc_tbl_ [18,864 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ts : POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
## $ name : chr [1:18864] "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
## $ reading : num [1:18864] 1489442 1489449 1489456 1489464 1489471 ...
## $ units : chr [1:18864] "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num [1:18864] 1489442 1489449 1489456 1489464 1489471 ...
## $ rate : num [1:18864] NA 7 7 8 7 8 7 8 7 8 ...
## - attr(*, "spec")=
## .. cols(
## .. ts = col_datetime(format = ""),
## .. name = col_character(),
## .. reading = col_double(),
## .. units = col_character(),
## .. cumulative = col_double(),
## .. rate = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Convert 'time' to datetime
wifi <- wifi %>% mutate(time = ymd_hms(time))
str(wifi)
## tibble [1,883,844 × 7] (S3: tbl_df/tbl/data.frame)
## $ time : POSIXct[1:1883844], format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Event Time : chr [1:1883844] "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
## $ Associated Client Count : num [1:1883844] 184 6 18 23 45 16 32 6 73 5 ...
## $ Authenticated Client Count: num [1:1883844] 182 6 18 23 45 16 32 6 73 5 ...
## $ Uni : chr [1:1883844] "Lancaster University" "Lancaster University" "Lancaster University" "Lancaster University" ...
## $ Building : chr [1:1883844] "Graduate College" "Management School" "SW hse 32-33" "SW hse 29" ...
## $ Floor : chr [1:1883844] "A Floor" "C Floor" "D Floor" "B floor" ...
# Lihat unique Building
unique(wifi$Building)
## [1] "Graduate College" "Management School"
## [3] "SW hse 32-33" "SW hse 29"
## [5] "Furness outer" "Slaidburn House (LUSU)"
## [7] "SW hse 34" "Bowland hall"
## [9] "Fylde" "SW hse 55-56"
## [11] "SW hse 40-42" "Bowland Twr (Old Bowland Annexe)"
## [13] "Pendle" "Faraday and cTAP"
## [15] "Institute for Advanced Studies" "University House"
## [17] "SW hse 53-54" "Engineering"
## [19] "Field Station" "SW hse 36"
## [21] "Infolab" "SW hse 21-23"
## [23] "LICA" "FU Hse 71-74"
## [25] "Grizedale" "Charles Carter"
## [27] "Furness" "SW hse 43-45"
## [29] "SW hse 12-16" "Bowland Main"
## [31] "SW hse 35" "Human Resources"
## [33] "County" "FY Hse 65-70"
## [35] "Barker House Farm" "SW hse 27-28"
## [37] "Bowland Annexe" "John Creed"
## [39] "grize-res" "SW hse 24-26"
## [41] "SW hse 20" "Ruskin Library"
## [43] "County South/Cartmel" "Library"
## [45] "Conference Centre" "Postgrad Stats (PSC)"
## [47] "Bowland North" "George Fox"
## [49] "Hse 75 77" "Bailrigg House"
## [51] "Sports Centre" "SW hse 30-31"
## [53] "Bowland Ash" "Alex Square"
## [55] "LEC" "MDC"
## [57] "ISS Building" "Chaplaincy Centre"
## [59] "CETAD" "SW hse 50-52"
## [61] "SW hse 17-19" "Central Workshops"
## [63] "SW hse 46-49" "SW hse 39"
## [65] "Science and technology" "Whewell"
## [67] "SW hse 37-38" "Lonsdale College (SW)"
## [69] "Physics" "County Field"
## [71] "Great Hall" "SW hse-158-179"
## [73] "Preschool" "Reception"
## [75] "Hazelrigg" "Energy Centre"
# Filter hanya rekaman Library
wifi <- wifi %>% mutate(Building = str_trim(Building))
wifi1 <- wifi %>% filter(Building == "Library") %>%
select(time, `Associated Client Count`)
# Resample / aggregate ke interval 10 menit (rata-rata Associated Client Count)
wifi1 <- wifi1 %>%
mutate(time = floor_date(time, unit = "10 minutes")) %>%
group_by(time) %>%
summarise(Mean_Client = mean(`Associated Client Count`, na.rm = TRUE)) %>%
ungroup()
# Cek missing dan duplicate
wifi1 %>% summarise_all(~ sum(is.na(.)))
## # A tibble: 1 × 2
## time Mean_Client
## <int> <int>
## 1 0 0
sum(duplicated(wifi1))
## [1] 0
head(wifi1)
## # A tibble: 6 × 2
## time Mean_Client
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 22.8
## 2 2020-02-01 00:10:00 20.6
## 3 2020-02-01 00:20:00 17.2
## 4 2020-02-01 00:30:00 15.4
## 5 2020-02-01 00:40:00 12.4
## 6 2020-02-01 00:50:00 10.5
# Convert 'ts' ke datetime
lib1 <- lib1 %>% mutate(ts = ymd_hms(ts))
lib2 <- lib2 %>% mutate(ts = ymd_hms(ts))
lib3 <- lib3 %>% mutate(ts = ymd_hms(ts))
# Cek persentase missing
for(i in list(lib1, lib2, lib3)){
df <- i
print(100 * sum(is.na(df$rate)) / nrow(df))
}
## [1] 16.15246
## [1] 16.15246
## [1] 16.15246
# Hitung mean dari 144 pengamatan pertama per meter dan isi NA
mean_lib1 <- mean(lib1$rate[1:144], na.rm = TRUE)
mean_lib2 <- mean(lib2$rate[1:144], na.rm = TRUE)
mean_lib3 <- mean(lib3$rate[1:144], na.rm = TRUE)
lib1 <- lib1 %>% mutate(rate = if_else(is.na(rate), mean_lib1, rate))
lib2 <- lib2 %>% mutate(rate = if_else(is.na(rate), mean_lib2, rate))
lib3 <- lib3 %>% mutate(rate = if_else(is.na(rate), mean_lib3, rate))
# Cek lagi missing
for(i in list(lib1, lib2, lib3)){
df <- i
print(100 * sum(is.na(df$rate)) / nrow(df))
}
## [1] 0
## [1] 0
## [1] 0
# Cek duplicate
cat("Duplicates lib1:", sum(duplicated(lib1)), "\n")
## Duplicates lib1: 21
cat("Duplicates lib2:", sum(duplicated(lib2)), "\n")
## Duplicates lib2: 21
cat("Duplicates lib3:", sum(duplicated(lib3)), "\n")
## Duplicates lib3: 21
# Pilih ts & rate lalu gabung pada ts
elec12 <- lib1 %>% select(ts, rate) %>%
rename(rate_1 = rate) %>%
inner_join(lib2 %>% select(ts, rate) %>% rename(rate_2 = rate), by = "ts")
electricity <- elec12 %>%
inner_join(lib3 %>% select(ts, rate) %>% rename(rate_3 = rate), by = "ts") %>%
mutate(total_electricity = rate_1 + rate_2 + rate_3) %>%
rename(time = ts)
# Cek missing & duplicates
sum(is.na(electricity))
## [1] 2248091
sum(duplicated(electricity))
## [1] 2225430
head(electricity)
## # A tibble: 6 × 5
## time rate_1 rate_2 rate_3 total_electricity
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 NA 7.38 17.9 39.6 64.8
## 2 NA 7.38 17.9 39 64.3
## 3 NA 7.38 17.9 35 60.3
## 4 NA 7.38 17.9 70 95.3
## 5 NA 7.38 17.9 40 65.3
## 6 NA 7.38 17.9 38 63.3
electricity <- electricity %>%
mutate(time = floor_date(time, unit = "10 minutes")) %>%
group_by(time) %>%
summarise(rate_1 = mean(rate_1, na.rm = TRUE),
rate_2 = mean(rate_2, na.rm = TRUE),
rate_3 = mean(rate_3, na.rm = TRUE)) %>%
ungroup() %>%
mutate(total_electricity = rate_1 + rate_2 + rate_3)
head(electricity)
## # A tibble: 6 × 5
## time rate_1 rate_2 rate_3 total_electricity
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-01-01 00:10:00 7 18 57 82
## 2 2020-01-01 00:20:00 7 20 44 71
## 3 2020-01-01 00:30:00 8 17 66 91
## 4 2020-01-01 00:40:00 7 15 63 85
## 5 2020-01-01 00:50:00 8 17 35 60
## 6 2020-01-01 01:00:00 7 17 47 71
# Inner join pada waktu yang sama
data <- inner_join(wifi1, electricity, by = "time")
# Pilih kolom yang relevan
df <- data %>% select(time, Mean_Client, total_electricity)
head(df)
## # A tibble: 6 × 3
## time Mean_Client total_electricity
## <dttm> <dbl> <dbl>
## 1 2020-02-01 00:10:00 20.6 97
## 2 2020-02-01 00:20:00 17.2 96
## 3 2020-02-01 00:30:00 15.4 100
## 4 2020-02-01 00:40:00 12.4 89
## 5 2020-02-01 00:50:00 10.5 91
## 6 2020-02-01 01:00:00 9.25 90
library(ggplot2)
ggplot(df) +
geom_line(aes(x = time, y = Mean_Client, colour = "Occupancy (WiFi)")) +
geom_line(aes(x = time, y = total_electricity, colour = "Electricity")) +
labs(title = "Library Occupancy vs Electricity Consumption (Time Series)",
x = "Time", y = "Counts / Electricity Rate") +
scale_colour_manual(name = "Legend", values = c("Occupancy (WiFi)" = "blue", "Electricity" = "red")) +
theme_minimal()
ggplot(df, aes(x = Mean_Client, y = total_electricity)) +
geom_point(alpha = 0.5) +
labs(title = "Scatter Plot: Occupancy vs Electricity Consumption",
x = "Library Occupancy (Mean Client Count)",
y = "Electricity Consumption (Total Rate)") +
theme_minimal()
## 4.3 Average Daily Profile (24h)
library(tidyverse)
library(lubridate)
df <- df %>%
mutate(hour = hour(time))
daily_profile <- df %>%
group_by(hour) %>%
summarise(
Mean_Client = mean(Mean_Client, na.rm = TRUE),
total_electricity = mean(total_electricity, na.rm = TRUE),
.groups = "drop"
)
p <- ggplot(daily_profile, aes(x = hour)) +
geom_line(aes(y = Mean_Client, colour = "Avg Occupancy"), linewidth = 1) +
geom_line(aes(y = total_electricity, colour = "Avg Electricity"), linewidth = 1) +
scale_x_continuous(
breaks = 0:23,
labels = sprintf("%02d:00", 0:23)
) +
labs(
title = "Average Daily Profile: Occupancy vs Electricity",
x = "Hour of Day",
y = "Average Value"
) +
scale_colour_manual(
name = "",
values = c("Avg Occupancy" = "blue", "Avg Electricity" = "red")
) +
theme_minimal(base_size = 13)
print(p)
library(dplyr)
peak_hours <- df %>%
mutate(hour = hour(time)) %>% # ambil angka jam
group_by(hour) %>%
summarise(avg_client = mean(Mean_Client, na.rm = TRUE),
.groups = "drop") %>%
arrange(desc(avg_client)) %>%
mutate(hour_label = sprintf("%02d:00", hour)) # ubah jadi "15:00"
cat("Peak hours of occupancy (top 5):\n")
## Peak hours of occupancy (top 5):
print(head(peak_hours[, c("hour_label", "avg_client")], 5))
## # A tibble: 5 × 2
## hour_label avg_client
## <chr> <dbl>
## 1 15:00 370.
## 2 14:00 357.
## 3 16:00 346.
## 4 13:00 331.
## 5 12:00 300.
# Peak occupancy pada timestamp spesifik
peak_timestamp <- df %>%
filter(Mean_Client == max(Mean_Client, na.rm = TRUE)) %>%
pull(time)
cat("\nPeak occupancy at specific timestamp:\n")
##
## Peak occupancy at specific timestamp:
print(peak_timestamp)
## [1] "2020-02-25 15:20:00 UTC"
# Siapkan data untuk regresi
df_reg <- df %>% drop_na(Mean_Client, total_electricity)
model <- lm(total_electricity ~ Mean_Client, data = df_reg)
summary(model)
##
## Call:
## lm(formula = total_electricity ~ Mean_Client, data = df_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.17 -15.63 1.72 16.24 51.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.036e+02 4.965e-01 208.6 <2e-16 ***
## Mean_Client 2.673e-01 2.399e-03 111.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.89 on 3655 degrees of freedom
## Multiple R-squared: 0.7726, Adjusted R-squared: 0.7725
## F-statistic: 1.241e+04 on 1 and 3655 DF, p-value: < 2.2e-16
# fitted & residual
df_reg <- df_reg %>%
mutate(fitted = fitted(model), residual = total_electricity - fitted)
threshold <- 2 * sd(df_reg$residual, na.rm = TRUE)
df_reg <- df_reg %>% mutate(unusual = abs(residual) > threshold)
# Plot observed vs fitted time series
p1 <- ggplot(df_reg) +
geom_line(aes(x = time, y = total_electricity, colour = "Observed"), alpha = 0.7) +
geom_line(aes(x = time, y = fitted, colour = "Fitted (Regression)"), linetype = "dashed") +
geom_line(aes(x = time, y = Mean_Client, colour = "Occupancy (WiFi)"), alpha = 0.7) +
scale_colour_manual(values = c("Observed" = "red", "Fitted (Regression)" = "brown", "Occupancy (WiFi)" = "blue")) +
labs(title = "Regresi Linear: Konsumsi Listrik ~ Okupansi", x = "Waktu", y = "Nilai") +
theme_minimal()
print(p1)
# Scatter plot highlight 'unusual'
p2 <- ggplot(df_reg, aes(x = Mean_Client,
y = total_electricity,
colour = unusual)) +
geom_point(alpha = 0.6) +
scale_colour_manual(
name = "Status",
values = c("TRUE" = "red", "FALSE" = "blue"),
labels = c("TRUE" = "Tidak Sejalan", "FALSE" = "Sejalan")
) +
labs(
title = "Okupansi vs Konsumsi Listrik",
subtitle = "Merah = Kasus Tidak Sejalan",
x = "Mean Client (Okupansi)",
y = "Total Electricity"
) +
theme_minimal()
print(p2)
df <- df %>%
mutate(dayofweek = wday(time) - 1, # 0 = Senin .. 6 = Minggu
day_type = if_else(dayofweek < 5, "Weekday", "Weekend"))
# Time series comparison (overlay)
p_ts <- ggplot() +
geom_line(data = df, aes(x = time, y = total_electricity, colour = day_type), alpha = 0.6) +
geom_line(data = df, aes(x = time, y = Mean_Client, colour = day_type), alpha = 0.6, linetype = "dashed") +
labs(title = "Perbandingan Time Series: Weekday vs Weekend",
x = "Waktu", y = "Nilai") +
theme_minimal()
print(p_ts)
# Scatter plot per kategori
p_sc <- ggplot(df, aes(x = Mean_Client, y = total_electricity, colour = day_type)) +
geom_point(alpha = 0.5) +
labs(title = "Scatter Plot Okupansi vs Konsumsi Listrik\nWeekday vs Weekend") +
theme_minimal()
print(p_sc)
# --- Profil harian rata-rata per kategori ---
# 1) Hitung nilai maksimum dari semua Mean_Client & total_electricity
max_y <- df %>%
summarise(
max_occ = max(Mean_Client, na.rm = TRUE),
max_ele = max(total_electricity, na.rm = TRUE)
) %>%
as.numeric() %>%
max()
# 2) Plot per kategori dengan Y limit yang sama
for(cat in unique(df$day_type)){
profile <- df %>%
filter(day_type == cat) %>%
mutate(hour_num = hour(time),
hour_lbl = sprintf("%02d:00", hour_num)) %>%
group_by(hour_num, hour_lbl) %>%
summarise(
Mean_Client = mean(Mean_Client, na.rm = TRUE),
total_electricity = mean(total_electricity, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(hour_num)
p <- ggplot(profile, aes(x = hour_num)) +
geom_line(aes(y = Mean_Client, colour = "Okupansi (WiFi)")) +
geom_line(aes(y = total_electricity, colour = "Konsumsi Listrik")) +
scale_x_continuous(
breaks = 0:23,
labels = sprintf("%02d:00", 0:23)
) +
scale_y_continuous(limits = c(0, max_y)) +
labs(title = paste0("Profil Harian Rata-rata (", cat, ")"),
x = "Jam", y = "Rata-rata") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}