–>

1. Import Library

library(tidyverse)
library(lubridate)
library(zoo)    
library(broom)  
library(scales) 

2. Loading data

# Wifi
wifi <- read_csv("C:/Users/ASUS/Downloads/wifi.csv")
# Preview 
head(wifi)
## # A tibble: 6 × 7
##   time                `Event Time` Associated Client Co…¹ Authenticated Client…²
##   <dttm>              <chr>                         <dbl>                  <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 …                    184                    182
## 2 2020-02-01 00:02:12 Sat Feb 01 …                      6                      6
## 3 2020-02-01 00:02:12 Sat Feb 01 …                     18                     18
## 4 2020-02-01 00:02:12 Sat Feb 01 …                     23                     23
## 5 2020-02-01 00:02:12 Sat Feb 01 …                     45                     45
## 6 2020-02-01 00:02:12 Sat Feb 01 …                     16                     16
## # ℹ abbreviated names: ¹​`Associated Client Count`,
## #   ²​`Authenticated Client Count`
## # ℹ 3 more variables: Uni <chr>, Building <chr>, Floor <chr>
str(wifi)
## spc_tbl_ [1,883,844 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ time                      : POSIXct[1:1883844], format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event Time                : chr [1:1883844] "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated Client Count   : num [1:1883844] 184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated Client Count: num [1:1883844] 182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr [1:1883844] "Lancaster University" "Lancaster University" "Lancaster University" "Lancaster University" ...
##  $ Building                  : chr [1:1883844] "Graduate College" "Management School" "SW hse 32-33" "SW hse 29" ...
##  $ Floor                     : chr [1:1883844] "A Floor" "C Floor" "D Floor" "B floor" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   time = col_datetime(format = ""),
##   ..   `Event Time` = col_character(),
##   ..   `Associated Client Count` = col_double(),
##   ..   `Authenticated Client Count` = col_double(),
##   ..   Uni = col_character(),
##   ..   Building = col_character(),
##   ..   Floor = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Library energy datasets
lib1 <- read_csv("C:/Users/ASUS/Downloads/library1.csv")
lib2 <- read_csv("C:/Users/ASUS/Downloads/library2.csv")
lib3 <- read_csv("C:/Users/Asus/Downloads/library3.csv")

head(lib1); head(lib2); head(lib3)
## # A tibble: 6 × 6
##   ts                  name              reading units cumulative  rate
##   <dttm>              <chr>               <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh      1489442    NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh      1489449     7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh      1489456     7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh      1489464     8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh      1489471     7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh      1489479     8
## # A tibble: 6 × 6
##   ts                  name               reading units cumulative  rate
##   <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh      2129016    NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh      2129034    18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh      2129054    20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh      2129071    17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh      2129086    15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh      2129103    17
## # A tibble: 6 × 6
##   ts                  name               reading units cumulative  rate
##   <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh      6914209    NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh      6914266    57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh      6914310    44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh      6914376    66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh      6914439    63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh      6914474    35
str(lib1)
## spc_tbl_ [18,864 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ts        : POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
##  $ name      : chr [1:18864] "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
##  $ reading   : num [1:18864] 1489442 1489449 1489456 1489464 1489471 ...
##  $ units     : chr [1:18864] "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num [1:18864] 1489442 1489449 1489456 1489464 1489471 ...
##  $ rate      : num [1:18864] NA 7 7 8 7 8 7 8 7 8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ts = col_datetime(format = ""),
##   ..   name = col_character(),
##   ..   reading = col_double(),
##   ..   units = col_character(),
##   ..   cumulative = col_double(),
##   ..   rate = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

3. Data Cleaning and Integration

3.1 Wifi Dataset

# Convert 'time' to datetime
wifi <- wifi %>% mutate(time = ymd_hms(time))
str(wifi)
## tibble [1,883,844 × 7] (S3: tbl_df/tbl/data.frame)
##  $ time                      : POSIXct[1:1883844], format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event Time                : chr [1:1883844] "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated Client Count   : num [1:1883844] 184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated Client Count: num [1:1883844] 182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr [1:1883844] "Lancaster University" "Lancaster University" "Lancaster University" "Lancaster University" ...
##  $ Building                  : chr [1:1883844] "Graduate College" "Management School" "SW hse 32-33" "SW hse 29" ...
##  $ Floor                     : chr [1:1883844] "A Floor" "C Floor" "D Floor" "B floor" ...
# Lihat unique Building
unique(wifi$Building)
##  [1] "Graduate College"                 "Management School"               
##  [3] "SW hse 32-33"                     "SW hse 29"                       
##  [5] "Furness outer"                    "Slaidburn House (LUSU)"          
##  [7] "SW hse 34"                        "Bowland hall"                    
##  [9] "Fylde"                            "SW hse 55-56"                    
## [11] "SW hse 40-42"                     "Bowland Twr (Old Bowland Annexe)"
## [13] "Pendle"                           "Faraday and cTAP"                
## [15] "Institute for Advanced Studies"   "University House"                
## [17] "SW hse 53-54"                     "Engineering"                     
## [19] "Field Station"                    "SW hse 36"                       
## [21] "Infolab"                          "SW hse 21-23"                    
## [23] "LICA"                             "FU Hse 71-74"                    
## [25] "Grizedale"                        "Charles Carter"                  
## [27] "Furness"                          "SW hse 43-45"                    
## [29] "SW hse 12-16"                     "Bowland Main"                    
## [31] "SW hse 35"                        "Human Resources"                 
## [33] "County"                           "FY Hse 65-70"                    
## [35] "Barker House Farm"                "SW hse 27-28"                    
## [37] "Bowland Annexe"                   "John Creed"                      
## [39] "grize-res"                        "SW hse 24-26"                    
## [41] "SW hse 20"                        "Ruskin Library"                  
## [43] "County South/Cartmel"             "Library"                         
## [45] "Conference Centre"                "Postgrad Stats (PSC)"            
## [47] "Bowland North"                    "George Fox"                      
## [49] "Hse 75 77"                        "Bailrigg House"                  
## [51] "Sports Centre"                    "SW hse 30-31"                    
## [53] "Bowland Ash"                      "Alex Square"                     
## [55] "LEC"                              "MDC"                             
## [57] "ISS Building"                     "Chaplaincy Centre"               
## [59] "CETAD"                            "SW hse 50-52"                    
## [61] "SW hse 17-19"                     "Central Workshops"               
## [63] "SW hse 46-49"                     "SW hse 39"                       
## [65] "Science and technology"           "Whewell"                         
## [67] "SW hse 37-38"                     "Lonsdale College (SW)"           
## [69] "Physics"                          "County Field"                    
## [71] "Great Hall"                       "SW hse-158-179"                  
## [73] "Preschool"                        "Reception"                       
## [75] "Hazelrigg"                        "Energy Centre"
# Filter hanya rekaman Library
wifi <- wifi %>% mutate(Building = str_trim(Building))
wifi1 <- wifi %>% filter(Building == "Library") %>%
  select(time, `Associated Client Count`)

# Resample / aggregate ke interval 10 menit (rata-rata Associated Client Count)
wifi1 <- wifi1 %>%
  mutate(time = floor_date(time, unit = "10 minutes")) %>%
  group_by(time) %>%
  summarise(Mean_Client = mean(`Associated Client Count`, na.rm = TRUE)) %>%
  ungroup()

# Cek missing dan duplicate
wifi1 %>% summarise_all(~ sum(is.na(.)))
## # A tibble: 1 × 2
##    time Mean_Client
##   <int>       <int>
## 1     0           0
sum(duplicated(wifi1))
## [1] 0
head(wifi1)
## # A tibble: 6 × 2
##   time                Mean_Client
##   <dttm>                    <dbl>
## 1 2020-02-01 00:00:00        22.8
## 2 2020-02-01 00:10:00        20.6
## 3 2020-02-01 00:20:00        17.2
## 4 2020-02-01 00:30:00        15.4
## 5 2020-02-01 00:40:00        12.4
## 6 2020-02-01 00:50:00        10.5

3.2 Library Energy Dataset

# Convert 'ts' ke datetime
lib1 <- lib1 %>% mutate(ts = ymd_hms(ts))
lib2 <- lib2 %>% mutate(ts = ymd_hms(ts))
lib3 <- lib3 %>% mutate(ts = ymd_hms(ts))

# Cek persentase missing
for(i in list(lib1, lib2, lib3)){
  df <- i
  print(100 * sum(is.na(df$rate)) / nrow(df))
}
## [1] 16.15246
## [1] 16.15246
## [1] 16.15246
# Hitung mean dari 144 pengamatan pertama per meter dan isi NA
mean_lib1 <- mean(lib1$rate[1:144], na.rm = TRUE)
mean_lib2 <- mean(lib2$rate[1:144], na.rm = TRUE)
mean_lib3 <- mean(lib3$rate[1:144], na.rm = TRUE)

lib1 <- lib1 %>% mutate(rate = if_else(is.na(rate), mean_lib1, rate))
lib2 <- lib2 %>% mutate(rate = if_else(is.na(rate), mean_lib2, rate))
lib3 <- lib3 %>% mutate(rate = if_else(is.na(rate), mean_lib3, rate))

# Cek lagi missing
for(i in list(lib1, lib2, lib3)){
  df <- i
  print(100 * sum(is.na(df$rate)) / nrow(df))
}
## [1] 0
## [1] 0
## [1] 0
# Cek duplicate
cat("Duplicates lib1:", sum(duplicated(lib1)), "\n")
## Duplicates lib1: 21
cat("Duplicates lib2:", sum(duplicated(lib2)), "\n")
## Duplicates lib2: 21
cat("Duplicates lib3:", sum(duplicated(lib3)), "\n")
## Duplicates lib3: 21

3.3 Merge tiga dataset listrik

# Pilih ts & rate lalu gabung pada ts
elec12 <- lib1 %>% select(ts, rate) %>%
  rename(rate_1 = rate) %>%
  inner_join(lib2 %>% select(ts, rate) %>% rename(rate_2 = rate), by = "ts")

electricity <- elec12 %>%
  inner_join(lib3 %>% select(ts, rate) %>% rename(rate_3 = rate), by = "ts") %>%
  mutate(total_electricity = rate_1 + rate_2 + rate_3) %>%
  rename(time = ts)

# Cek missing & duplicates
sum(is.na(electricity))
## [1] 2248091
sum(duplicated(electricity))
## [1] 2225430
head(electricity)
## # A tibble: 6 × 5
##   time   rate_1 rate_2 rate_3 total_electricity
##   <dttm>  <dbl>  <dbl>  <dbl>             <dbl>
## 1 NA       7.38   17.9   39.6              64.8
## 2 NA       7.38   17.9   39                64.3
## 3 NA       7.38   17.9   35                60.3
## 4 NA       7.38   17.9   70                95.3
## 5 NA       7.38   17.9   40                65.3
## 6 NA       7.38   17.9   38                63.3

3.4 Standardisasi interval 10 menit untuk listrik (jika perlu)

electricity <- electricity %>%
  mutate(time = floor_date(time, unit = "10 minutes")) %>%
  group_by(time) %>%
  summarise(rate_1 = mean(rate_1, na.rm = TRUE),
            rate_2 = mean(rate_2, na.rm = TRUE),
            rate_3 = mean(rate_3, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(total_electricity = rate_1 + rate_2 + rate_3)

head(electricity)
## # A tibble: 6 × 5
##   time                rate_1 rate_2 rate_3 total_electricity
##   <dttm>               <dbl>  <dbl>  <dbl>             <dbl>
## 1 2020-01-01 00:10:00      7     18     57                82
## 2 2020-01-01 00:20:00      7     20     44                71
## 3 2020-01-01 00:30:00      8     17     66                91
## 4 2020-01-01 00:40:00      7     15     63                85
## 5 2020-01-01 00:50:00      8     17     35                60
## 6 2020-01-01 01:00:00      7     17     47                71

3.5 Gabungkan Wifi dan Electricity

# Inner join pada waktu yang sama
data <- inner_join(wifi1, electricity, by = "time")

# Pilih kolom yang relevan
df <- data %>% select(time, Mean_Client, total_electricity)

head(df)
## # A tibble: 6 × 3
##   time                Mean_Client total_electricity
##   <dttm>                    <dbl>             <dbl>
## 1 2020-02-01 00:10:00       20.6                 97
## 2 2020-02-01 00:20:00       17.2                 96
## 3 2020-02-01 00:30:00       15.4                100
## 4 2020-02-01 00:40:00       12.4                 89
## 5 2020-02-01 00:50:00       10.5                 91
## 6 2020-02-01 01:00:00        9.25                90

4. Visualisasi

4.1 Time Series Plot of Occupancy and Electricity Consumption

library(ggplot2)

ggplot(df) +
  geom_line(aes(x = time, y = Mean_Client, colour = "Occupancy (WiFi)")) +
  geom_line(aes(x = time, y = total_electricity, colour = "Electricity")) +
  labs(title = "Library Occupancy vs Electricity Consumption (Time Series)",
       x = "Time", y = "Counts / Electricity Rate") +
  scale_colour_manual(name = "Legend", values = c("Occupancy (WiFi)" = "blue", "Electricity" = "red")) +
  theme_minimal()

4.2 Scatter Plot: Occupancy (X) vs Electricity (Y)

ggplot(df, aes(x = Mean_Client, y = total_electricity)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter Plot: Occupancy vs Electricity Consumption",
       x = "Library Occupancy (Mean Client Count)",
       y = "Electricity Consumption (Total Rate)") +
  theme_minimal()

## 4.3 Average Daily Profile (24h)

library(tidyverse)
library(lubridate)

df <- df %>%
  mutate(hour = hour(time))

daily_profile <- df %>%
  group_by(hour) %>%
  summarise(
    Mean_Client      = mean(Mean_Client, na.rm = TRUE),
    total_electricity = mean(total_electricity, na.rm = TRUE),
    .groups = "drop"
  )

p <- ggplot(daily_profile, aes(x = hour)) +
  geom_line(aes(y = Mean_Client, colour = "Avg Occupancy"), linewidth = 1) +
  geom_line(aes(y = total_electricity, colour = "Avg Electricity"), linewidth = 1) +
  scale_x_continuous(
    breaks = 0:23,
    labels = sprintf("%02d:00", 0:23)
  ) +
  labs(
    title = "Average Daily Profile: Occupancy vs Electricity",
    x = "Hour of Day",
    y = "Average Value"
  ) +
  scale_colour_manual(
    name   = "",
    values = c("Avg Occupancy" = "blue", "Avg Electricity" = "red")
  ) +
  theme_minimal(base_size = 13)

print(p)

5. Analisis

5.1 Peak hours of occupancy

library(dplyr)

peak_hours <- df %>%
  mutate(hour = hour(time)) %>%               # ambil angka jam
  group_by(hour) %>%
  summarise(avg_client = mean(Mean_Client, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(desc(avg_client)) %>%
  mutate(hour_label = sprintf("%02d:00", hour))   # ubah jadi "15:00"

cat("Peak hours of occupancy (top 5):\n")
## Peak hours of occupancy (top 5):
print(head(peak_hours[, c("hour_label", "avg_client")], 5))
## # A tibble: 5 × 2
##   hour_label avg_client
##   <chr>           <dbl>
## 1 15:00            370.
## 2 14:00            357.
## 3 16:00            346.
## 4 13:00            331.
## 5 12:00            300.
# Peak occupancy pada timestamp spesifik
peak_timestamp <- df %>%
  filter(Mean_Client == max(Mean_Client, na.rm = TRUE)) %>%
  pull(time)

cat("\nPeak occupancy at specific timestamp:\n")
## 
## Peak occupancy at specific timestamp:
print(peak_timestamp)
## [1] "2020-02-25 15:20:00 UTC"

5.2 Korelasi & Regresi Linear

# Siapkan data untuk regresi
df_reg <- df %>% drop_na(Mean_Client, total_electricity)

model <- lm(total_electricity ~ Mean_Client, data = df_reg)
summary(model)
## 
## Call:
## lm(formula = total_electricity ~ Mean_Client, data = df_reg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57.17 -15.63   1.72  16.24  51.08 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.036e+02  4.965e-01   208.6   <2e-16 ***
## Mean_Client 2.673e-01  2.399e-03   111.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.89 on 3655 degrees of freedom
## Multiple R-squared:  0.7726, Adjusted R-squared:  0.7725 
## F-statistic: 1.241e+04 on 1 and 3655 DF,  p-value: < 2.2e-16
# fitted & residual
df_reg <- df_reg %>%
  mutate(fitted = fitted(model), residual = total_electricity - fitted)

threshold <- 2 * sd(df_reg$residual, na.rm = TRUE)
df_reg <- df_reg %>% mutate(unusual = abs(residual) > threshold)

# Plot observed vs fitted time series
p1 <- ggplot(df_reg) +
  geom_line(aes(x = time, y = total_electricity, colour = "Observed"), alpha = 0.7) +
  geom_line(aes(x = time, y = fitted, colour = "Fitted (Regression)"), linetype = "dashed") +
  geom_line(aes(x = time, y = Mean_Client, colour = "Occupancy (WiFi)"), alpha = 0.7) +
  scale_colour_manual(values = c("Observed" = "red", "Fitted (Regression)" = "brown", "Occupancy (WiFi)" = "blue")) +
  labs(title = "Regresi Linear: Konsumsi Listrik ~ Okupansi", x = "Waktu", y = "Nilai") +
  theme_minimal()

print(p1)

# Scatter plot highlight 'unusual'
p2 <- ggplot(df_reg, aes(x = Mean_Client,
                         y = total_electricity,
                         colour = unusual)) +
  geom_point(alpha = 0.6) +
  scale_colour_manual(
    name   = "Status",
    values = c("TRUE" = "red", "FALSE" = "blue"),
    labels = c("TRUE" = "Tidak Sejalan", "FALSE" = "Sejalan")
  ) +
  labs(
    title = "Okupansi vs Konsumsi Listrik",
    subtitle = "Merah = Kasus Tidak Sejalan",
    x = "Mean Client (Okupansi)",
    y = "Total Electricity"
  ) +
  theme_minimal()

print(p2)

— 6. Weekend vs Weekday Comparison —

df <- df %>% 
  mutate(dayofweek = wday(time) - 1,   # 0 = Senin .. 6 = Minggu
         day_type = if_else(dayofweek < 5, "Weekday", "Weekend"))

# Time series comparison (overlay)
p_ts <- ggplot() +
  geom_line(data = df, aes(x = time, y = total_electricity, colour = day_type), alpha = 0.6) +
  geom_line(data = df, aes(x = time, y = Mean_Client, colour = day_type), alpha = 0.6, linetype = "dashed") +
  labs(title = "Perbandingan Time Series: Weekday vs Weekend",
       x = "Waktu", y = "Nilai") +
  theme_minimal()
print(p_ts)

# Scatter plot per kategori
p_sc <- ggplot(df, aes(x = Mean_Client, y = total_electricity, colour = day_type)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter Plot Okupansi vs Konsumsi Listrik\nWeekday vs Weekend") +
  theme_minimal()
print(p_sc)

# --- Profil harian rata-rata per kategori ---
# 1) Hitung nilai maksimum dari semua Mean_Client & total_electricity
max_y <- df %>%
  summarise(
    max_occ = max(Mean_Client, na.rm = TRUE),
    max_ele = max(total_electricity, na.rm = TRUE)
  ) %>%
  as.numeric() %>%
  max()

# 2) Plot per kategori dengan Y limit yang sama
for(cat in unique(df$day_type)){
  profile <- df %>% 
    filter(day_type == cat) %>%
    mutate(hour_num  = hour(time),
           hour_lbl  = sprintf("%02d:00", hour_num)) %>%
    group_by(hour_num, hour_lbl) %>%
    summarise(
      Mean_Client = mean(Mean_Client, na.rm = TRUE),
      total_electricity = mean(total_electricity, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(hour_num)

  p <- ggplot(profile, aes(x = hour_num)) +
    geom_line(aes(y = Mean_Client, colour = "Okupansi (WiFi)")) +
    geom_line(aes(y = total_electricity, colour = "Konsumsi Listrik")) +
    scale_x_continuous(
      breaks = 0:23,
      labels = sprintf("%02d:00", 0:23)
    ) +
    scale_y_continuous(limits = c(0, max_y)) +
    labs(title = paste0("Profil Harian Rata-rata (", cat, ")"),
         x = "Jam", y = "Rata-rata") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}