library(dplyr)
library(lubridate)
library(stringr)
library(zoo)
library(ggplot2)
# load data
df1 <- read.csv("library1.csv")
df2 <- read.csv("library2.csv")
df3 <- read.csv("library3.csv")
df4 <- read.csv("wifi.csv")
# Menunjukkan 6 data pertama
head(df1)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8
head(df2)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh 2129016 NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh 2129034 18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh 2129054 20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh 2129071 17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh 2129086 15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh 2129103 17
head(df3)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh 6914209 NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh 6914266 57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh 6914310 44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh 6914376 66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh 6914439 63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh 6914474 35
head(df4)
## time Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 16
## Authenticated.Client.Count Uni Building
## 1 182 Lancaster University Graduate College
## 2 6 Lancaster University Management School
## 3 18 Lancaster University SW hse 32-33
## 4 23 Lancaster University SW hse 29
## 5 45 Lancaster University Furness outer
## 6 16 Lancaster University Slaidburn House (LUSU)
## Floor
## 1 A Floor
## 2 C Floor
## 3 D Floor
## 4 B floor
## 5 C floor
## 6 E floor
# Melihat struktur dari keempat dataset
str(df1)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : chr "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
## $ name : chr "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
## $ reading : num 1489442 1489449 1489456 1489464 1489471 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 1489442 1489449 1489456 1489464 1489471 ...
## $ rate : num NA 7 7 8 7 8 7 8 7 8 ...
str(df2)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : chr "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
## $ name : chr "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
## $ reading : num 2129016 2129034 2129054 2129071 2129086 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 2129016 2129034 2129054 2129071 2129086 ...
## $ rate : num NA 18 20 17 15 17 17 17 17 17 ...
str(df3)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : chr "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
## $ name : chr "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
## $ reading : num 6914209 6914266 6914310 6914376 6914439 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 6914209 6914266 6914310 6914376 6914439 ...
## $ rate : num NA 57 44 66 63 35 47 55 34 41 ...
str(df4)
## 'data.frame': 1883844 obs. of 7 variables:
## $ time : chr "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Event.Time : chr "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
## $ Associated.Client.Count : int 184 6 18 23 45 16 32 6 73 5 ...
## $ Authenticated.Client.Count: int 182 6 18 23 45 16 32 6 73 5 ...
## $ Uni : chr "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
## $ Building : chr " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
## $ Floor : chr " A Floor" " C Floor" " D Floor" " B floor" ...
# Memastikan kolom ts/time menjadi datetime lalu mengurutkan ts/time
df1 <- df1 %>%
mutate(ts = ymd_hms(ts)) %>%
arrange(ts)
df2 <- df2 %>%
mutate(ts = ymd_hms(ts)) %>%
arrange(ts)
df3 <- df3 %>%
mutate(ts = ymd_hms(ts)) %>%
arrange(ts)
df4 <- df4 %>%
mutate(time = ymd_hms(time)) %>%
arrange(time)
# Filter untuk menggunakan data 'Library' saja pada kolom 'Building'
df4 <- df4 %>%
filter(str_trim(Building) == "Library") # str_trim untuk hapus spasi berlebih pada " Library "
head(df4)
## time Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 29
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 0
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 21
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 38
## 5 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 0
## 6 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 33
## Authenticated.Client.Count Uni Building Floor
## 1 28 Lancaster University Library A floor
## 2 0 Lancaster University Library LG floor
## 3 20 Lancaster University Library C floor
## 4 37 Lancaster University Library B floor
## 5 0 Lancaster University Library LG floor
## 6 31 Lancaster University Library A floor
# Resample WiFi data (mean dari Associated Client Count) per 10 menit
wifi_resampled <- df4 %>%
group_by(time = floor_date(time, "10 minutes")) %>% # membulatkan waktu ke interval 10 menit
summarise(library_occupancy = mean(`Associated.Client.Count`, na.rm = TRUE)) %>% # data merupakan mean dari interval 10 menit
ungroup()
head(wifi_resampled)
## # A tibble: 6 × 2
## time library_occupancy
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 22.8
## 2 2020-02-01 00:10:00 20.6
## 3 2020-02-01 00:20:00 17.2
## 4 2020-02-01 00:30:00 15.4
## 5 2020-02-01 00:40:00 12.4
## 6 2020-02-01 00:50:00 10.5
# Menggabungkan ketiga library meters menjadi satu dataframe
library_energy <- data.frame(
time = df1$ts,
rate_1 = df1$rate,
rate_2 = df2$rate,
rate_3 = df3$rate
)
head(library_energy)
## time rate_1 rate_2 rate_3
## 1 2020-01-01 00:00:00 NA NA NA
## 2 2020-01-01 00:10:00 7 18 57
## 3 2020-01-01 00:20:00 7 20 44
## 4 2020-01-01 00:30:00 8 17 66
## 5 2020-01-01 00:40:00 7 15 63
## 6 2020-01-01 00:50:00 8 17 35
# Data merging
data <- library_energy %>%
inner_join(wifi_resampled, by = "time") # key join kolom 'time'
head(data)
## time rate_1 rate_2 rate_3 library_occupancy
## 1 2020-02-01 00:00:00 NA NA NA 22.750
## 2 2020-02-01 00:10:00 13 11 73 20.625
## 3 2020-02-01 00:20:00 13 11 72 17.250
## 4 2020-02-01 00:30:00 11 15 74 15.375
## 5 2020-02-01 00:40:00 11 13 65 12.375
## 6 2020-02-01 00:50:00 11 16 64 10.500
colSums(is.na(data))
## time rate_1 rate_2 rate_3
## 0 1 1 1
## library_occupancy
## 0
# Imputasi masing-masing rate dengan mean dari 144 observasi pertama
for (col in c("rate_1", "rate_2", "rate_3")) {
mean_val <- mean(data[[col]][1:144], na.rm = TRUE)
data[[col]][is.na(data[[col]])] <- mean_val
}
head(data)
## time rate_1 rate_2 rate_3 library_occupancy
## 1 2020-02-01 00:00:00 15.83916 24.34965 82.60839 22.750
## 2 2020-02-01 00:10:00 13.00000 11.00000 73.00000 20.625
## 3 2020-02-01 00:20:00 13.00000 11.00000 72.00000 17.250
## 4 2020-02-01 00:30:00 11.00000 15.00000 74.00000 15.375
## 5 2020-02-01 00:40:00 11.00000 13.00000 65.00000 12.375
## 6 2020-02-01 00:50:00 11.00000 16.00000 64.00000 10.500
# Agregasi ketiga library meter
data <- data %>%
mutate(total_rate = rate_1 + rate_2 + rate_3)
head(data)
## time rate_1 rate_2 rate_3 library_occupancy total_rate
## 1 2020-02-01 00:00:00 15.83916 24.34965 82.60839 22.750 122.7972
## 2 2020-02-01 00:10:00 13.00000 11.00000 73.00000 20.625 97.0000
## 3 2020-02-01 00:20:00 13.00000 11.00000 72.00000 17.250 96.0000
## 4 2020-02-01 00:30:00 11.00000 15.00000 74.00000 15.375 100.0000
## 5 2020-02-01 00:40:00 11.00000 13.00000 65.00000 12.375 89.0000
## 6 2020-02-01 00:50:00 11.00000 16.00000 64.00000 10.500 91.0000
sum(duplicated(data))
## [1] 0
# Deteksi outlier menggunakan IQR
find_outliers <- function(vec) {
q1 <- quantile(vec, 0.25, na.rm = TRUE)
q3 <- quantile(vec, 0.75, na.rm = TRUE)
iqr <- q3 - q1
cut_off <- 1.5 * iqr
minimum <- q1 - cut_off
maximum <- q3 + cut_off
outliers <- vec[vec < minimum | vec > maximum]
return(outliers)
}
find_outliers(data$total_rate)
## numeric(0)
find_outliers(data$library_occupancy)
## numeric(0)
# Simpan ke CSV
write.csv(data, "Clean Data.csv", row.names = FALSE)
#stacked time series
library(ggplot2)
ggplot() +
geom_area(data = data, aes(x = time, y = library_occupancy),
fill = "orange", alpha = 0.8) +
geom_area(data = data, aes(x = time, y = total_rate),
fill = "blue", alpha = 0.8) +
labs(
title = "Stacked Time Series: Energy Consumption vs Library Occupancy",
x = "Time",
y = "Value"
) +
theme_minimal()
#Scatter Plot: occupancy (X) vs energy consumption
ggplot(data, aes(x = library_occupancy, y = total_rate)) +
geom_point(alpha = 0.5, color = "blue") +
labs(
title = "Scatter Plot: Occupancy vs Energy Consumption",
x = "Library Occupancy",
y = "Energy Consumption"
) +
theme_minimal()
#daily profiles energy consumption
data %>%
mutate(
hour = as.POSIXct(format(time, "%H:%M"), format = "%H:%M")
) %>%
ggplot(aes(x = hour, y = total_rate, group = as.Date(time))) +
geom_line(color = "gray", alpha = 0.4) +
stat_summary(fun = mean, geom = "line", color = "blue",
aes(group = 1), size = 1.2) +
scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hours") + #biar format hh:mm per 2h
labs(
title = "Daily Profiles: Energy Consumption",
x = "Hour of Day",
y = "Mean Energy Consumption"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #biar label gk numpuk
#library occupancy daily profiles
data %>%
mutate(
hour = as.POSIXct(format(time, "%H:%M"), format = "%H:%M")
) %>%
ggplot(aes(x = hour, y = library_occupancy, group = as.Date(time))) +
geom_line(color = "gray", alpha = 0.4) +
stat_summary(fun = mean, geom = "line", color = "darkgreen",
aes(group = 1), size = 1.2) +
scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hours") + #biar format hh:mm per 2h
labs(
title = "Daily Profiles: Library Occupancy",
x = "Hour of Day",
y = "Mean Library Occupancy"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) #biar label gk numpuk
# Library GGPlot
library(ggplot2)
# Peak Hours of Occupancy
# Rata-rata occupancy per jam per hari
hourly_occ <- df4 %>%
group_by(hour = lubridate::floor_date(time, "hour")) %>%
summarise(avg_occ = mean(Associated.Client.Count, na.rm = TRUE)) %>%
ungroup()
head(hourly_occ)
## # A tibble: 6 × 2
## hour avg_occ
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 16.5
## 2 2020-02-01 01:00:00 8.27
## 3 2020-02-01 02:00:00 7.04
## 4 2020-02-01 03:00:00 7.21
## 5 2020-02-01 04:00:00 7.12
## 6 2020-02-01 05:00:00 6.60
# Rata-rata occupancy per 10 menit per hari
occ_10min <- df4 %>%
group_by(time = lubridate::floor_date(time, "10 minutes")) %>%
summarise(avg_occ = mean(Associated.Client.Count, na.rm = TRUE)) %>%
ungroup()
# Visualisasi pola occupancy per jam
ggplot(hourly_occ, aes(x = hour, y = avg_occ)) +
geom_line(color = "black") +
labs(title = "Average Hourly Occupancy in Library",
x = "Hour of Day",
y = "Average Associated Client Count") +
theme_minimal()
# Top 3 peak occupancy hours
top3 <- occ_10min %>%
arrange(desc(avg_occ)) %>%
head(3)
cat("Top 3 peak occupancy hours:\n")
## Top 3 peak occupancy hours:
print(top3)
## # A tibble: 3 × 2
## time avg_occ
## <dttm> <dbl>
## 1 2020-02-25 15:20:00 512.
## 2 2020-02-25 15:30:00 512.
## 3 2020-02-18 14:40:00 509
# Occupancy influences energy?
# Variabel
occ <- data$library_occupancy
eng <- data$total_rate
# Pearson correlation
cor_test <- cor.test(occ, eng, method = "pearson")
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: occ and eng
## t = 111.43, df = 3681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8706546 0.8854374
## sample estimates:
## cor
## 0.8782556
# OLS Regression (Energy ~ Occupancy)
model <- lm(total_rate ~ library_occupancy, data = data)
summary(model)
##
## Call:
## lm(formula = total_rate ~ library_occupancy, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.328 -15.549 1.796 16.218 50.933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.037e+02 4.940e-01 210.0 <2e-16 ***
## library_occupancy 2.668e-01 2.395e-03 111.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.89 on 3681 degrees of freedom
## Multiple R-squared: 0.7713, Adjusted R-squared: 0.7713
## F-statistic: 1.242e+04 on 1 and 3681 DF, p-value: < 2.2e-16
# Scatter plot
ggplot(data, aes(x = library_occupancy, y = total_rate)) +
geom_point(alpha = 0.4, color = "blue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Occupancy vs Energy Consumption",
x = "Library Occupancy (WiFi count)",
y = "Total Energy Consumption (rate)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Highlight Cases Energy Does Not Align Occupancy
# Data Prediksi
data$predicted_energy <- predict(model, newdata = data)
pred <- data$predicted_energy
# Hitung residuals
data$residual <- eng - pred
residual <- data$residual
# Tentukan threshold anomali
threshold <- 1.5 * sd(residual, na.rm = TRUE)
# Tandai anomalies
anomalies <- data[abs(residual) > threshold, ]
cat("Total anomalies found:", nrow(anomalies), "\n")
## Total anomalies found: 487
# Visualisasi
ggplot(data, aes(x = library_occupancy, y = total_rate)) +
geom_point(alpha = 0.4, color = "blue") +
geom_point(data = anomalies, aes(x = library_occupancy, y = total_rate),
color = "red", size = 3, shape = 21, fill = "red") +
geom_line(aes(y = predicted_energy), color = "green", size = 1) +
labs(
title = "Energy vs Occupancy with Anomalies Highlighted",
x = "Library Occupancy",
y = "Total Energy Consumption"
) +
theme_minimal()
data <- data %>%
mutate(
date = as.Date(time),
hour = hour(time),
dayofweek = wday(time, week_start = 1),
is_weekend = dayofweek >= 6
)
ggplot(data, aes(x = library_occupancy, y = total_rate, color = is_weekend)) +
geom_point(alpha = 0.5) +
scale_color_manual(
name = "Kategori Hari",
values = c("FALSE" = "blue", "TRUE" = "red"),
labels = c("Weekday", "Weekend")
) +
labs(
title = "Occupancy vs Energy Consumption (Weekday vs Weekend)",
x = "Occupancy",
y = "Energy Consumption"
) +
theme_minimal() +
theme(legend.position = "top")
energy_profiles <- data %>%
group_by(is_weekend, date, hour) %>%
summarise(total_rate = mean(total_rate, na.rm = TRUE), .groups = 'drop')
energy_mean <- energy_profiles %>%
group_by(is_weekend, hour) %>%
summarise(total_rate = mean(total_rate, na.rm = TRUE), .groups = 'drop')
occ_profiles <- data %>%
group_by(is_weekend, date, hour) %>%
summarise(library_occupancy = mean(library_occupancy, na.rm = TRUE), .groups = 'drop')
occ_mean <- occ_profiles %>%
group_by(is_weekend, hour) %>%
summarise(library_occupancy = mean(library_occupancy, na.rm = TRUE), .groups = 'drop')
ggplot() +
geom_line(data = energy_profiles,
aes(x = hour, y = total_rate, group = date, color = is_weekend),
alpha = 0.2) +
geom_line(data = energy_mean,
aes(x = hour, y = total_rate, color = is_weekend),
linewidth = 1.5) +
scale_color_manual(
name = "Kategori Hari",
values = c("FALSE" = "blue", "TRUE" = "red"),
labels = c("Weekday", "Weekend")
) +
labs(
title = "Weekday vs Weekend: Daily Profiles (Energy Consumption)",
x = "Hour of Day",
y = "Energy Consumption"
) +
scale_x_continuous(breaks = seq(0, 23, 2)) +
theme_minimal() +
theme(legend.position = "top")
ggplot() +
geom_line(data = occ_profiles,
aes(x = hour, y = library_occupancy, group = date, color = is_weekend),
alpha = 0.2) +
geom_line(data = occ_mean,
aes(x = hour, y = library_occupancy, color = is_weekend),
linewidth = 1.5) +
scale_color_manual(
name = "Kategori Hari",
values = c("FALSE" = "blue", "TRUE" = "red"),
labels = c("Weekday", "Weekend")
) +
labs(
title = "Weekday vs Weekend: Daily Profiles (Occupancy)",
x = "Hour of Day",
y = "Occupancy"
) +
scale_x_continuous(breaks = seq(0, 23, 2)) +
theme_minimal() +
theme(legend.position = "top")