LIBRARY

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)

DATA INTEGRATING AND DATA CLEANING

WiFi Dataset

wifi <-  read.csv("D:/Users/user/Downloads/wifi.csv")
head(wifi)

##                  time                   Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                     184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      16
##   Authenticated.Client.Count                   Uni                 Building
## 1                        182 Lancaster University         Graduate College 
## 2                          6 Lancaster University        Management School 
## 3                         18 Lancaster University             SW hse 32-33 
## 4                         23 Lancaster University                SW hse 29 
## 5                         45 Lancaster University            Furness outer 
## 6                         16 Lancaster University   Slaidburn House (LUSU) 
##      Floor
## 1  A Floor
## 2  C Floor
## 3  D Floor
## 4  B floor
## 5  C floor
## 6  E floor

str(wifi)

## 'data.frame':    1883844 obs. of  7 variables:
##  $ time                      : chr  "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event.Time                : chr  "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated.Client.Count   : int  184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated.Client.Count: int  182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr  "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
##  $ Building                  : chr  " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
##  $ Floor                     : chr  " A Floor" " C Floor" " D Floor" " B floor" ...

Converting a time variable to a datetime

wifi$time <- as.POSIXct(wifi$time, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
str(wifi$time)

##  POSIXct[1:1883844], format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...

Use only Library building WiFi records

wifi$Building <- trimws(wifi$Building)
wifi_library <- wifi[tolower(wifi$Building) == "library", 
                     c("time", "Associated.Client.Count")]
head(wifi_library)

##                    time Associated.Client.Count
## 74  2020-02-01 00:02:12                      29
## 158 2020-02-01 00:02:12                       0
## 235 2020-02-01 00:02:12                      21
## 260 2020-02-01 00:02:12                      38
## 291 2020-02-01 00:07:34                       0
## 497 2020-02-01 00:07:34                      33

str(wifi_library)

## 'data.frame':    28652 obs. of  2 variables:
##  $ time                   : POSIXct, format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Associated.Client.Count: int  29 0 21 38 0 33 22 39 30 0 ...

Ensure all data is standardized at 10-minute intervals: Resample WiFi data (mean of Associated Client Count).

wifi_10min <- wifi_library %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(Associated_Client_Counts = mean(Associated.Client.Count, na.rm = TRUE)) %>%
  ungroup()
head(wifi_10min)

## # A tibble: 6 × 2
##   time                Associated_Client_Counts
##   <dttm>                                 <dbl>
## 1 2020-02-01 00:00:00                     22.8
## 2 2020-02-01 00:10:00                     20.6
## 3 2020-02-01 00:20:00                     17.2
## 4 2020-02-01 00:30:00                     15.4
## 5 2020-02-01 00:40:00                     12.4
## 6 2020-02-01 00:50:00                     10.5

Check Missing Value and Duplicate wifi_10min

colSums(is.na(wifi_10min))

##                     time Associated_Client_Counts 
##                        0                        0

sum(is.na(wifi_10min))

## [1] 0

cat("Sum of Duplicated = ", sum(duplicated(wifi_10min)),"\n" )

## Sum of Duplicated =  0

Delete Duplicate Data

wifi_10min_clean <- wifi_10min %>%
  distinct()
cat("Sum of Duplicated Now = ", sum(duplicated(wifi_10min)),"\n" )

## Sum of Duplicated Now =  0

Library Dataset

lib1 <- read.csv("D:/Users/user/Downloads/library1.csv")
lib2 <- read.csv("D:/Users/user/Downloads/library2.csv")
lib3 <- read.csv("D:/Users/user/Downloads/library3.csv")

head(lib1)

##                    ts              name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442   KWh    1489442   NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449   KWh    1489449    7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456   KWh    1489456    7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464   KWh    1489464    8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471   KWh    1489471    7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479   KWh    1489479    8

head(lib2)

##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016   KWh    2129016   NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034   KWh    2129034   18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054   KWh    2129054   20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071   KWh    2129071   17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086   KWh    2129086   15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103   KWh    2129103   17

head(lib3)

##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209   KWh    6914209   NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266   KWh    6914266   57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310   KWh    6914310   44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376   KWh    6914376   66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439   KWh    6914439   63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474   KWh    6914474   35

Change time to datetime

lib1$ts <- ymd_hms(lib1$ts, tz = "UTC")
lib2$ts <- ymd_hms(lib2$ts, tz = "UTC")
lib3$ts <- ymd_hms(lib3$ts, tz = "UTC")

str(lib1$ts)

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...

str(lib2$ts)

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...

str(lib3$ts)

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...

Check Missing Value and Duplicate library data

colSums(is.na(lib1))

##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041       3047

colSums(is.na(lib2))

##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041       3047

colSums(is.na(lib3))

##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041       3047

Imputation to ‘rate variable’ with the average of the first 144 observations

mean_rate1 <- mean(lib1$rate[1:144], na.rm = TRUE)
mean_rate2 <- mean(lib2$rate[1:144], na.rm = TRUE)
mean_rate3 <- mean(lib3$rate[1:144], na.rm = TRUE)

lib1$rate[is.na(lib1$rate)] <- mean_rate1
lib2$rate[is.na(lib2$rate)] <- mean_rate2
lib3$rate[is.na(lib3$rate)] <- mean_rate3

Check missing value after imputation

colSums(is.na(lib1))

##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041          0

colSums(is.na(lib2))

##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041          0

colSums(is.na(lib3))

##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041          0

Select only the ts and rate columns, then rename them to make them clear.

lib1_sel <- lib1 %>% select(ts, rate1 = rate)
lib2_sel <- lib2 %>% select(ts, rate2 = rate)
lib3_sel <- lib3 %>% select(ts, rate3 = rate)

Merge library 1, library 2, and library 3

lib_all <- lib1_sel %>%
  left_join(lib2_sel, by = "ts") %>%
  left_join(lib3_sel, by = "ts") %>%
  mutate(total_rate = rate1 + rate2 + rate3)
head(lib_all)

##                    ts    rate1    rate2    rate3 total_rate
## 1 2020-01-01 00:00:00 7.384615 17.87413 39.55944   64.81818
## 2 2020-01-01 00:10:00 7.000000 18.00000 57.00000   82.00000
## 3 2020-01-01 00:20:00 7.000000 20.00000 44.00000   71.00000
## 4 2020-01-01 00:30:00 8.000000 17.00000 66.00000   91.00000
## 5 2020-01-01 00:40:00 7.000000 15.00000 63.00000   85.00000
## 6 2020-01-01 00:50:00 8.000000 17.00000 35.00000   60.00000

Rename ‘ts’ in lib_all data to ‘time’

lib_all <- lib_all %>%
  rename(time = ts)

head(lib_all)

##                  time    rate1    rate2    rate3 total_rate
## 1 2020-01-01 00:00:00 7.384615 17.87413 39.55944   64.81818
## 2 2020-01-01 00:10:00 7.000000 18.00000 57.00000   82.00000
## 3 2020-01-01 00:20:00 7.000000 20.00000 44.00000   71.00000
## 4 2020-01-01 00:30:00 8.000000 17.00000 66.00000   91.00000
## 5 2020-01-01 00:40:00 7.000000 15.00000 63.00000   85.00000
## 6 2020-01-01 00:50:00 8.000000 17.00000 35.00000   60.00000

Merge ‘lib_all’ and ‘wifi_10min’

final_data <- lib_all %>%
  inner_join(wifi_10min, by = "time")

head(final_data)

##                  time     rate1    rate2    rate3 total_rate
## 1 2020-02-01 00:00:00  7.384615 17.87413 39.55944   64.81818
## 2 2020-02-01 00:10:00 13.000000 11.00000 73.00000   97.00000
## 3 2020-02-01 00:20:00 13.000000 11.00000 72.00000   96.00000
## 4 2020-02-01 00:30:00 11.000000 15.00000 74.00000  100.00000
## 5 2020-02-01 00:40:00 11.000000 13.00000 65.00000   89.00000
## 6 2020-02-01 00:50:00 11.000000 16.00000 64.00000   91.00000
##   Associated_Client_Counts
## 1                   22.750
## 2                   20.625
## 3                   17.250
## 4                   15.375
## 5                   12.375
## 6                   10.500

DATA VISUALIZATION

Time Series Plots: show occupancy and energy consumption patterns over the month.

ggplot(final_data, aes(x = time)) +
  geom_line(aes(y = total_rate, color = "Energy Consumption"), size = 1) +
  geom_line(aes(y = Associated_Client_Counts, color = "Occupancy"), size = 1) +
  labs(
    title = "Time Series: Energy Consumption vs Occupancy",
    x = "Time",
    y = "Value"
  ) +
  scale_color_manual(
    name = "Legend",
    values = c("Energy Consumption" = "blue", "Occupancy" = "red")
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Scatter Plot: occupancy (X) vs energy consumption (Y).

ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
  geom_point(alpha = 0.6, color = "green") +
  labs(
    title = "Scatter Plot: Occupancy vs Energy Consumption",
    x = "Occupancy",
    y = "Energy Consumption"
  ) +
  theme_minimal()

# Daily Profiles (24h): illustrate daily cycles (each day represented by one line of time series).

daily_profile <- final_data %>%
  mutate(
    date = as.Date(time),
    hour = hour(time) + minute(time)/60   # jam dalam bentuk desimal
  )

ENERGY CONSUMPTION

ggplot(daily_profile, aes(x = hour, y = total_rate, group = date, color = date)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "Daily Profiles (24h) - Total Rate",
    x = "Hour of Day",
    y = "Total Rate"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

OCCUPANCY

ggplot(daily_profile, aes(x = hour, y = Associated_Client_Counts, group = date, color = date)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "Daily Profiles (24h) - Total Rate",
    x = "Hour of Day",
    y = "Total Rate"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Overlay the average daily profile for general patterns.

ENERGY CONSUMPTION

avg_profile <- daily_profile %>%
  group_by(hour) %>%
  summarise(mean_total_rate = mean(total_rate, na.rm = TRUE))

ggplot(daily_profile, aes(x = hour, y = total_rate, group = date, color = date)) +
  geom_line(alpha = 0.4) +
  geom_line(
    data = avg_profile,
    aes(x = hour, y = mean_total_rate),
    color = "black", size = 1.2,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Daily Profiles with Average Overlay - Total Rate",
    x = "Hour of Day",
    y = "Total Rate"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

OCCUPANCY

avg_profile_clients <- daily_profile %>%
  group_by(hour) %>%
  summarise(mean_clients = mean(Associated_Client_Counts, na.rm = TRUE))
ggplot(daily_profile, aes(x = hour, y = Associated_Client_Counts, group = date, color = date)) +
  geom_line(alpha = 0.4) +
  geom_line(
    data = avg_profile_clients,
    aes(x = hour, y = mean_clients),
    color = "black", size = 1.2,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Daily Profiles with Average Overlay - Associated Client Counts",
    x = "Hour of Day",
    y = "Associated Client Counts"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ANALYSIS

Identify peak hours of occupancy in the library.

peak_hours <- daily_profile %>%
  group_by(hour) %>%
  summarise(mean_total_rate = mean(total_rate, na.rm = TRUE), .groups = "drop")

peak_hour <- peak_hours$hour[which.max(peak_hours$mean_total_rate)]
peak_value <- max(peak_hours$mean_total_rate, na.rm = TRUE)

cat("Jam puncak okupansi:", peak_hour, "\n",
    "Rata-rata device terhubung pada jam tersebut:", round(peak_value, 2))

## Jam puncak okupansi: 15.5 
##  Rata-rata device terhubung pada jam tersebut: 194.44

# Visualization for peak hours
ggplot(peak_hours, aes(x = hour, y = mean_total_rate)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Library Occupancy by Hour",
    x = "Hour of Day",
    y = "Average Total Rate"
  ) +
  theme_minimal()

# Examine whether occupancy significantly influences energy consumption.

Use regression method

model <- lm(total_rate ~ Associated_Client_Counts, data = final_data)
summary(model)

## 
## Call:
## lm(formula = total_rate ~ Associated_Client_Counts, data = final_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.299 -15.551   1.791  16.244  50.958 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.037e+02  4.943e-01   209.8   <2e-16 ***
## Associated_Client_Counts 2.669e-01  2.396e-03   111.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.9 on 3681 degrees of freedom
## Multiple R-squared:  0.7713, Adjusted R-squared:  0.7712 
## F-statistic: 1.241e+04 on 1 and 3681 DF,  p-value: < 2.2e-16

# Visualization of the relationship between occupancy and energy
library(ggplot2)
ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(title = "The Effect of Occupancy on Energy Consumption",
       x = "Occupancy",
       y = "Energy Consumption") +
  theme_minimal(base_size = 14)

## `geom_smooth()` using formula = 'y ~ x'

# Highlight cases where energy consumption does not align with occupancy.

model <- lm(total_rate ~ Associated_Client_Counts, data = final_data, na.action = na.exclude)
final_data$residuals <- residuals(model)
threshold <- 2 * sd(final_data$residuals, na.rm = TRUE)
final_data$outlier <- abs(final_data$residuals) > threshold
outlier_cases <- final_data %>%
  filter(outlier == TRUE) %>%
  select(time, Associated_Client_Counts, total_rate, residuals)

print(head(outlier_cases,10))

##                   time Associated_Client_Counts total_rate residuals
## 1  2020-02-01 00:00:00                    22.75   64.81818 -44.95039
## 2  2020-02-01 04:50:00                     6.75   59.00000 -46.49781
## 3  2020-02-01 06:20:00                     3.75   53.00000 -51.69705
## 4  2020-02-01 06:30:00                     3.75   56.00000 -48.69705
## 5  2020-02-01 06:40:00                     3.50   61.00000 -43.63032
## 6  2020-02-02 07:40:00                     5.50   63.00000 -42.16416
## 7  2020-02-04 06:40:00                     9.50   62.00000 -44.23185
## 8  2020-02-04 21:20:00                   184.75  195.00000  41.98997
## 9  2020-02-04 22:10:00                   148.00  191.00000  47.79938
## 10 2020-02-04 22:30:00                   126.00  184.00000  46.67167

Visualization for outlier

ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
  geom_point(aes(color = outlier), size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "red")) +
  labs(title = "Outlier: Energy Consumption vs Occupancy",
       x = "Occupancy (Associated Client Counts)",
       y = "Energy Consumption (Total Rate)",
       color = "Outlier") +
  theme_minimal(base_size = 14)

## `geom_smooth()` using formula = 'y ~ x'

WEEKDAYS VS WEEKEND

Separate the dataset into weekdays (Mon–Fri) and weekends (Sat–Sun).

final_data$day_of_week <- weekdays(as.Date(final_data$time))

# WEEKDAYS DATASET
final_data_weekdays <- final_data %>%
  filter(!(day_of_week %in% c("Saturday", "Sunday")))

# WEEKEND DATASET
final_data_weekends <- final_data %>%
  filter(day_of_week %in% c("Saturday", "Sunday"))

total_weekday <- nrow(final_data_weekdays)
total_weekend <- nrow(final_data_weekends)

cat("Total Weekday data:", total_weekday, "\n")

## Total Weekday data: 2531

cat("Total Weekend data:", total_weekend, "\n")

## Total Weekend data: 1152

Reproduce the previous visualizations for both categories.

final_data <- final_data %>%
  mutate(category = ifelse(weekdays(as.Date(time)) %in% c("Saturday", "Sunday"),
                           "Weekend", "Weekday"))

### 1. Time series plot (total_rate & Associated_Client_Counts)
ggplot(final_data, aes(x = time)) +
  geom_line(aes(y = total_rate, color = "Total Rate")) +
  geom_line(aes(y = Associated_Client_Counts, color = "Associated Client Counts")) +
  facet_wrap(~category, scales = "free_x", ncol = 1) +
  labs(title = "Time Series: Weekday vs Weekend",
       y = "Value", color = "Legend") +
  theme_minimal()

### 2. Scatter plot (Occupancy vs Energy)
ggplot(final_data, aes(x = Associated_Client_Counts, y = total_rate)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  facet_wrap(~category) +
  labs(title = "Scatter Plot: Occupancy vs Energy (Weekday vs Weekend)",
       x = "Associated Client Counts (Occupancy)",
       y = "Total Rate (Energy)") +
  theme_minimal()

### 3. Daily profiles
final_data <- final_data %>%
  mutate(hour = as.integer(format(as.POSIXct(time), "%H")))

ggplot(final_data, aes(x = hour, y = total_rate, group = as.Date(time), color = category)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~category) +
  labs(title = "Daily Profiles: Weekday vs Weekend",
       x = "Hour of Day", y = "Total Rate") +
  theme_minimal()

### 4. Average daily profile overlay
avg_profile <- final_data %>%
  group_by(category, hour) %>%
  summarise(avg_rate = mean(total_rate, na.rm = TRUE),
            avg_clients = mean(Associated_Client_Counts, na.rm = TRUE),
            .groups = "drop")

ggplot(avg_profile, aes(x = hour, y = avg_rate, color = category)) +
  geom_line(size = 1.2) +
  facet_wrap(~category) +
  labs(title = "Average Daily Profiles: Weekday vs Weekend",
       x = "Hour of Day", y = "Average Total Rate") +
  theme_minimal()

Compare and Interpret the Differences.

Weekdays: Energy consumption is higher, with peak hours clearly during the day and activity continuing into the evening.

Weekends: Energy consumption is lower and shorter, indicating reduced campus activity.

These results are consistent with campus activity patterns: weekdays are filled with lectures, research, and facility use; weekends are limited to specific activities.

Data Mining Madness R Adventurer

Dani Auliya Rahmat (5003231101)

2025-13-09