library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
wifi <- read.csv("D:/Users/user/Downloads/wifi.csv")
head(wifi)
## time Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 16
## Authenticated.Client.Count Uni Building
## 1 182 Lancaster University Graduate College
## 2 6 Lancaster University Management School
## 3 18 Lancaster University SW hse 32-33
## 4 23 Lancaster University SW hse 29
## 5 45 Lancaster University Furness outer
## 6 16 Lancaster University Slaidburn House (LUSU)
## Floor
## 1 A Floor
## 2 C Floor
## 3 D Floor
## 4 B floor
## 5 C floor
## 6 E floor
str(wifi)
## 'data.frame': 1883844 obs. of 7 variables:
## $ time : chr "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Event.Time : chr "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
## $ Associated.Client.Count : int 184 6 18 23 45 16 32 6 73 5 ...
## $ Authenticated.Client.Count: int 182 6 18 23 45 16 32 6 73 5 ...
## $ Uni : chr "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
## $ Building : chr " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
## $ Floor : chr " A Floor" " C Floor" " D Floor" " B floor" ...
Converting a time variable to a datetime
wifi$time <- as.POSIXct(wifi$time, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
str(wifi$time)
## POSIXct[1:1883844], format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
Use only Library building WiFi records
wifi$Building <- trimws(wifi$Building)
wifi_library <- wifi[tolower(wifi$Building) == "library",
c("time", "Associated.Client.Count")]
head(wifi_library)
## time Associated.Client.Count
## 74 2020-02-01 00:02:12 29
## 158 2020-02-01 00:02:12 0
## 235 2020-02-01 00:02:12 21
## 260 2020-02-01 00:02:12 38
## 291 2020-02-01 00:07:34 0
## 497 2020-02-01 00:07:34 33
str(wifi_library)
## 'data.frame': 28652 obs. of 2 variables:
## $ time : POSIXct, format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Associated.Client.Count: int 29 0 21 38 0 33 22 39 30 0 ...
Ensure all data is standardized at 10-minute intervals: Resample WiFi data (mean of Associated Client Count).
wifi_10min <- wifi_library %>%
mutate(time = floor_date(time, "10 minutes")) %>%
group_by(time) %>%
summarise(Associated_Client_Counts = mean(Associated.Client.Count, na.rm = TRUE)) %>%
ungroup()
head(wifi_10min)
## # A tibble: 6 × 2
## time Associated_Client_Counts
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 22.8
## 2 2020-02-01 00:10:00 20.6
## 3 2020-02-01 00:20:00 17.2
## 4 2020-02-01 00:30:00 15.4
## 5 2020-02-01 00:40:00 12.4
## 6 2020-02-01 00:50:00 10.5
Check Missing Value and Duplicate wifi_10min
colSums(is.na(wifi_10min))
## time Associated_Client_Counts
## 0 0
sum(is.na(wifi_10min))
## [1] 0
cat("Sum of Duplicated = ", sum(duplicated(wifi_10min)),"\n" )
## Sum of Duplicated = 0
Delete Duplicate Data
wifi_10min_clean <- wifi_10min %>%
distinct()
cat("Sum of Duplicated Now = ", sum(duplicated(wifi_10min)),"\n" )
## Sum of Duplicated Now = 0
lib1 <- read.csv("D:/Users/user/Downloads/library1.csv")
lib2 <- read.csv("D:/Users/user/Downloads/library2.csv")
lib3 <- read.csv("D:/Users/user/Downloads/library3.csv")
head(lib1)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8
head(lib2)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh 2129016 NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh 2129034 18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh 2129054 20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh 2129071 17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh 2129086 15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh 2129103 17
head(lib3)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh 6914209 NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh 6914266 57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh 6914310 44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh 6914376 66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh 6914439 63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh 6914474 35
Change time to datetime
lib1$ts <- ymd_hms(lib1$ts, tz = "UTC")
lib2$ts <- ymd_hms(lib2$ts, tz = "UTC")
lib3$ts <- ymd_hms(lib3$ts, tz = "UTC")
str(lib1$ts)
## POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
str(lib2$ts)
## POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
str(lib3$ts)
## POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
Check Missing Value and Duplicate library data
colSums(is.na(lib1))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 3047
colSums(is.na(lib2))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 3047
colSums(is.na(lib3))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 3047
Imputation to ‘rate variable’ with the average of the first 144 observations
mean_rate1 <- mean(lib1$rate[1:144], na.rm = TRUE)
mean_rate2 <- mean(lib2$rate[1:144], na.rm = TRUE)
mean_rate3 <- mean(lib3$rate[1:144], na.rm = TRUE)
lib1$rate[is.na(lib1$rate)] <- mean_rate1
lib2$rate[is.na(lib2$rate)] <- mean_rate2
lib3$rate[is.na(lib3$rate)] <- mean_rate3
Check missing value after imputation
colSums(is.na(lib1))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 0
colSums(is.na(lib2))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 0
colSums(is.na(lib3))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 0
Select only the ts and rate columns, then rename them to make them clear.
lib1_sel <- lib1 %>% select(ts, rate1 = rate)
lib2_sel <- lib2 %>% select(ts, rate2 = rate)
lib3_sel <- lib3 %>% select(ts, rate3 = rate)
Merge library 1, library 2, and library 3
lib_all <- lib1_sel %>%
left_join(lib2_sel, by = "ts") %>%
left_join(lib3_sel, by = "ts") %>%
mutate(total_rate = rate1 + rate2 + rate3)
head(lib_all)
## ts rate1 rate2 rate3 total_rate
## 1 2020-01-01 00:00:00 7.384615 17.87413 39.55944 64.81818
## 2 2020-01-01 00:10:00 7.000000 18.00000 57.00000 82.00000
## 3 2020-01-01 00:20:00 7.000000 20.00000 44.00000 71.00000
## 4 2020-01-01 00:30:00 8.000000 17.00000 66.00000 91.00000
## 5 2020-01-01 00:40:00 7.000000 15.00000 63.00000 85.00000
## 6 2020-01-01 00:50:00 8.000000 17.00000 35.00000 60.00000
Rename ‘ts’ in lib_all data to ‘time’
lib_all <- lib_all %>%
rename(time = ts)
head(lib_all)
## time rate1 rate2 rate3 total_rate
## 1 2020-01-01 00:00:00 7.384615 17.87413 39.55944 64.81818
## 2 2020-01-01 00:10:00 7.000000 18.00000 57.00000 82.00000
## 3 2020-01-01 00:20:00 7.000000 20.00000 44.00000 71.00000
## 4 2020-01-01 00:30:00 8.000000 17.00000 66.00000 91.00000
## 5 2020-01-01 00:40:00 7.000000 15.00000 63.00000 85.00000
## 6 2020-01-01 00:50:00 8.000000 17.00000 35.00000 60.00000
Merge ‘lib_all’ and ‘wifi_10min’
final_data <- lib_all %>%
inner_join(wifi_10min, by = "time")
head(final_data)
## time rate1 rate2 rate3 total_rate
## 1 2020-02-01 00:00:00 7.384615 17.87413 39.55944 64.81818
## 2 2020-02-01 00:10:00 13.000000 11.00000 73.00000 97.00000
## 3 2020-02-01 00:20:00 13.000000 11.00000 72.00000 96.00000
## 4 2020-02-01 00:30:00 11.000000 15.00000 74.00000 100.00000
## 5 2020-02-01 00:40:00 11.000000 13.00000 65.00000 89.00000
## 6 2020-02-01 00:50:00 11.000000 16.00000 64.00000 91.00000
## Associated_Client_Counts
## 1 22.750
## 2 20.625
## 3 17.250
## 4 15.375
## 5 12.375
## 6 10.500
Time Series Plots: show occupancy and energy consumption patterns over the month.
ggplot(final_data, aes(x = time)) +
geom_line(aes(y = total_rate, color = "Energy Consumption"), size = 1) +
geom_line(aes(y = Associated_Client_Counts, color = "Occupancy"), size = 1) +
labs(
title = "Time Series: Energy Consumption vs Occupancy",
x = "Time",
y = "Value"
) +
scale_color_manual(
name = "Legend",
values = c("Energy Consumption" = "blue", "Occupancy" = "red")
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Scatter Plot: occupancy (X) vs energy consumption (Y).
ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
geom_point(alpha = 0.6, color = "green") +
labs(
title = "Scatter Plot: Occupancy vs Energy Consumption",
x = "Occupancy",
y = "Energy Consumption"
) +
theme_minimal()
# Daily Profiles (24h): illustrate daily cycles (each day represented by
one line of time series).
daily_profile <- final_data %>%
mutate(
date = as.Date(time),
hour = hour(time) + minute(time)/60 # jam dalam bentuk desimal
)
ENERGY CONSUMPTION
ggplot(daily_profile, aes(x = hour, y = total_rate, group = date, color = date)) +
geom_line(alpha = 0.6) +
labs(
title = "Daily Profiles (24h) - Total Rate",
x = "Hour of Day",
y = "Total Rate"
) +
theme_minimal() +
theme(legend.position = "none")
OCCUPANCY
ggplot(daily_profile, aes(x = hour, y = Associated_Client_Counts, group = date, color = date)) +
geom_line(alpha = 0.6) +
labs(
title = "Daily Profiles (24h) - Total Rate",
x = "Hour of Day",
y = "Total Rate"
) +
theme_minimal() +
theme(legend.position = "none")
ENERGY CONSUMPTION
avg_profile <- daily_profile %>%
group_by(hour) %>%
summarise(mean_total_rate = mean(total_rate, na.rm = TRUE))
ggplot(daily_profile, aes(x = hour, y = total_rate, group = date, color = date)) +
geom_line(alpha = 0.4) +
geom_line(
data = avg_profile,
aes(x = hour, y = mean_total_rate),
color = "black", size = 1.2,
inherit.aes = FALSE
) +
labs(
title = "Daily Profiles with Average Overlay - Total Rate",
x = "Hour of Day",
y = "Total Rate"
) +
theme_minimal() +
theme(legend.position = "none")
OCCUPANCY
avg_profile_clients <- daily_profile %>%
group_by(hour) %>%
summarise(mean_clients = mean(Associated_Client_Counts, na.rm = TRUE))
ggplot(daily_profile, aes(x = hour, y = Associated_Client_Counts, group = date, color = date)) +
geom_line(alpha = 0.4) +
geom_line(
data = avg_profile_clients,
aes(x = hour, y = mean_clients),
color = "black", size = 1.2,
inherit.aes = FALSE
) +
labs(
title = "Daily Profiles with Average Overlay - Associated Client Counts",
x = "Hour of Day",
y = "Associated Client Counts"
) +
theme_minimal() +
theme(legend.position = "none")
peak_hours <- daily_profile %>%
group_by(hour) %>%
summarise(mean_total_rate = mean(total_rate, na.rm = TRUE), .groups = "drop")
peak_hour <- peak_hours$hour[which.max(peak_hours$mean_total_rate)]
peak_value <- max(peak_hours$mean_total_rate, na.rm = TRUE)
cat("Jam puncak okupansi:", peak_hour, "\n",
"Rata-rata device terhubung pada jam tersebut:", round(peak_value, 2))
## Jam puncak okupansi: 15.5
## Rata-rata device terhubung pada jam tersebut: 194.44
# Visualization for peak hours
ggplot(peak_hours, aes(x = hour, y = mean_total_rate)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(
title = "Average Library Occupancy by Hour",
x = "Hour of Day",
y = "Average Total Rate"
) +
theme_minimal()
# Examine whether occupancy significantly influences energy
consumption.
Use regression method
model <- lm(total_rate ~ Associated_Client_Counts, data = final_data)
summary(model)
##
## Call:
## lm(formula = total_rate ~ Associated_Client_Counts, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.299 -15.551 1.791 16.244 50.958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.037e+02 4.943e-01 209.8 <2e-16 ***
## Associated_Client_Counts 2.669e-01 2.396e-03 111.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.9 on 3681 degrees of freedom
## Multiple R-squared: 0.7713, Adjusted R-squared: 0.7712
## F-statistic: 1.241e+04 on 1 and 3681 DF, p-value: < 2.2e-16
# Visualization of the relationship between occupancy and energy
library(ggplot2)
ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "The Effect of Occupancy on Energy Consumption",
x = "Occupancy",
y = "Energy Consumption") +
theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
# Highlight cases where energy consumption does not align with
occupancy.
model <- lm(total_rate ~ Associated_Client_Counts, data = final_data, na.action = na.exclude)
final_data$residuals <- residuals(model)
threshold <- 2 * sd(final_data$residuals, na.rm = TRUE)
final_data$outlier <- abs(final_data$residuals) > threshold
outlier_cases <- final_data %>%
filter(outlier == TRUE) %>%
select(time, Associated_Client_Counts, total_rate, residuals)
print(head(outlier_cases,10))
## time Associated_Client_Counts total_rate residuals
## 1 2020-02-01 00:00:00 22.75 64.81818 -44.95039
## 2 2020-02-01 04:50:00 6.75 59.00000 -46.49781
## 3 2020-02-01 06:20:00 3.75 53.00000 -51.69705
## 4 2020-02-01 06:30:00 3.75 56.00000 -48.69705
## 5 2020-02-01 06:40:00 3.50 61.00000 -43.63032
## 6 2020-02-02 07:40:00 5.50 63.00000 -42.16416
## 7 2020-02-04 06:40:00 9.50 62.00000 -44.23185
## 8 2020-02-04 21:20:00 184.75 195.00000 41.98997
## 9 2020-02-04 22:10:00 148.00 191.00000 47.79938
## 10 2020-02-04 22:30:00 126.00 184.00000 46.67167
Visualization for outlier
ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
geom_point(aes(color = outlier), size = 2, alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "red")) +
labs(title = "Outlier: Energy Consumption vs Occupancy",
x = "Occupancy (Associated Client Counts)",
y = "Energy Consumption (Total Rate)",
color = "Outlier") +
theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
final_data$day_of_week <- weekdays(as.Date(final_data$time))
# WEEKDAYS DATASET
final_data_weekdays <- final_data %>%
filter(!(day_of_week %in% c("Saturday", "Sunday")))
# WEEKEND DATASET
final_data_weekends <- final_data %>%
filter(day_of_week %in% c("Saturday", "Sunday"))
total_weekday <- nrow(final_data_weekdays)
total_weekend <- nrow(final_data_weekends)
cat("Total Weekday data:", total_weekday, "\n")
## Total Weekday data: 2531
cat("Total Weekend data:", total_weekend, "\n")
## Total Weekend data: 1152
final_data <- final_data %>%
mutate(category = ifelse(weekdays(as.Date(time)) %in% c("Saturday", "Sunday"),
"Weekend", "Weekday"))
### 1. Time series plot (total_rate & Associated_Client_Counts)
ggplot(final_data, aes(x = time)) +
geom_line(aes(y = total_rate, color = "Total Rate")) +
geom_line(aes(y = Associated_Client_Counts, color = "Associated Client Counts")) +
facet_wrap(~category, scales = "free_x", ncol = 1) +
labs(title = "Time Series: Weekday vs Weekend",
y = "Value", color = "Legend") +
theme_minimal()
### 2. Scatter plot (Occupancy vs Energy)
ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
geom_point(alpha = 0.4, color = "steelblue") +
facet_wrap(~category) +
labs(title = "Scatter Plot: Occupancy vs Energy (Weekday vs Weekend)",
x = "Associated Client Counts (Occupancy)",
y = "Total Rate (Energy)") +
theme_minimal()
### 3. Daily profiles
final_data <- final_data %>%
mutate(hour = as.integer(format(as.POSIXct(time), "%H")))
ggplot(final_data, aes(x = hour, y = total_rate, group = as.Date(time), color = category)) +
geom_line(alpha = 0.3) +
facet_wrap(~category) +
labs(title = "Daily Profiles: Weekday vs Weekend",
x = "Hour of Day", y = "Total Rate") +
theme_minimal()
### 4. Average daily profile overlay
avg_profile <- final_data %>%
group_by(category, hour) %>%
summarise(avg_rate = mean(total_rate, na.rm = TRUE),
avg_clients = mean(Associated_Client_Counts, na.rm = TRUE),
.groups = "drop")
ggplot(avg_profile, aes(x = hour, y = avg_rate, color = category)) +
geom_line(size = 1.2) +
facet_wrap(~category) +
labs(title = "Average Daily Profiles: Weekday vs Weekend",
x = "Hour of Day", y = "Average Total Rate") +
theme_minimal()
Weekdays: Energy consumption is higher, with peak hours clearly during the day and activity continuing into the evening.
Weekends: Energy consumption is lower and shorter, indicating reduced campus activity.
These results are consistent with campus activity patterns: weekdays are filled with lectures, research, and facility use; weekends are limited to specific activities.