Import Library

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr   1.1.4     ✔ readr   2.1.5
## ✔ forcats 1.0.0     ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0     ✔ tibble  3.3.0
## ✔ purrr   1.1.0     ✔ tidyr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)

Membaca data dari file CSV

wifi_data <- read.csv("wifi.csv") # nolint
head(wifi_data)
##                  time                   Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                     184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      16
##   Authenticated.Client.Count                   Uni                 Building
## 1                        182 Lancaster University         Graduate College 
## 2                          6 Lancaster University        Management School 
## 3                         18 Lancaster University             SW hse 32-33 
## 4                         23 Lancaster University                SW hse 29 
## 5                         45 Lancaster University            Furness outer 
## 6                         16 Lancaster University   Slaidburn House (LUSU) 
##      Floor
## 1  A Floor
## 2  C Floor
## 3  D Floor
## 4  B floor
## 5  C floor
## 6  E floor
str(wifi_data)
## 'data.frame':    1883844 obs. of  7 variables:
##  $ time                      : chr  "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event.Time                : chr  "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated.Client.Count   : int  184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated.Client.Count: int  182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr  "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
##  $ Building                  : chr  " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
##  $ Floor                     : chr  " A Floor" " C Floor" " D Floor" " B floor" ...

Mengubah kolom ‘time’ menjadi format POSIXct

wifi_data$time <- ymd_hms(wifi_data$time)
str(wifi_data)
## 'data.frame':    1883844 obs. of  7 variables:
##  $ time                      : POSIXct, format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event.Time                : chr  "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated.Client.Count   : int  184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated.Client.Count: int  182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr  "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
##  $ Building                  : chr  " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
##  $ Floor                     : chr  " A Floor" " C Floor" " D Floor" " B floor" ...

Menampilkan shape data

dim(wifi_data)
## [1] 1883844       7

Mengecek berapa banyak nilai unik pada setiap variabel

sapply(wifi_data, function(x) length(unique(x)))
##                       time                 Event.Time 
##                       7165                       7165 
##    Associated.Client.Count Authenticated.Client.Count 
##                        895                        878 
##                        Uni                   Building 
##                          1                         76 
##                      Floor 
##                         42

Mengecek nilai unik pada kolom Building

unique(wifi_data$Building)
##  [1] " Graduate College "                 " Management School "               
##  [3] " SW hse 32-33 "                     " SW hse 29 "                       
##  [5] " Furness outer "                    " Slaidburn House (LUSU) "          
##  [7] " SW hse 34 "                        " Bowland hall "                    
##  [9] " Fylde "                            " SW hse 55-56 "                    
## [11] " SW hse 40-42 "                     " Bowland Twr (Old Bowland Annexe) "
## [13] " Pendle "                           " Faraday and cTAP "                
## [15] " Institute for Advanced Studies "   " University House "                
## [17] " SW hse 53-54 "                     " Engineering "                     
## [19] " Field Station "                    " SW hse 36 "                       
## [21] " Infolab "                          " SW hse 21-23 "                    
## [23] " LICA "                             " FU Hse 71-74 "                    
## [25] " Grizedale "                        " Charles Carter "                  
## [27] " Furness "                          " SW hse 43-45 "                    
## [29] " SW hse 12-16 "                     " Bowland Main "                    
## [31] " SW hse 35 "                        " Human Resources "                 
## [33] " County "                           " FY Hse 65-70 "                    
## [35] " Barker House Farm "                " SW hse 27-28 "                    
## [37] " Bowland Annexe "                   " John Creed "                      
## [39] " grize-res "                        " SW hse 24-26 "                    
## [41] " SW hse 20 "                        " Ruskin Library "                  
## [43] " County South/Cartmel "             " Library "                         
## [45] " Conference Centre "                " Postgrad Stats (PSC) "            
## [47] " Bowland North "                    " George Fox "                      
## [49] " Hse 75 77 "                        " Bailrigg House "                  
## [51] " Sports Centre "                    " SW hse 30-31 "                    
## [53] " Bowland Ash "                      " Alex Square "                     
## [55] " LEC "                              " MDC "                             
## [57] " ISS Building "                     " Chaplaincy Centre "               
## [59] " CETAD "                            " SW hse 50-52 "                    
## [61] " SW hse 17-19 "                     " Central Workshops "               
## [63] " SW hse 46-49 "                     " SW hse 39 "                       
## [65] " Science and technology "           " Whewell "                         
## [67] " SW hse 37-38 "                     " Lonsdale College (SW) "           
## [69] " Physics "                          " County Field "                    
## [71] " Great Hall "                       " SW hse-158-179 "                  
## [73] " Preschool "                        " Reception "                       
## [75] " Hazelrigg "                        " Energy Centre "

Filter data hanya mengambil data library dari building

library_data <- subset(wifi_data, Building == " Library ")
head(library_data)
##                    time                   Event.Time Associated.Client.Count
## 74  2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      29
## 158 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       0
## 235 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      21
## 260 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      38
## 291 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                       0
## 497 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                      33
##     Authenticated.Client.Count                   Uni  Building     Floor
## 74                          28 Lancaster University   Library    A floor
## 158                          0 Lancaster University   Library   LG floor
## 235                         20 Lancaster University   Library    C floor
## 260                         37 Lancaster University   Library    B floor
## 291                          0 Lancaster University   Library   LG floor
## 497                         31 Lancaster University   Library    A floor

Melakukan resample library_data menjadi interval 10 menit

Resample Associated Client Count menggunakan mean

library_data_resampled <- library_data %>%
  arrange(time) %>%
  group_by(interval = floor_date(time, "10 minutes")) %>%
  summarise(mean_client_count = mean(Associated.Client.Count, na.rm = TRUE)) %>%
  ungroup()
tail(library_data_resampled)
## # A tibble: 6 × 2
##   interval            mean_client_count
##   <dttm>                          <dbl>
## 1 2020-02-26 12:50:00              374.
## 2 2020-02-26 13:00:00              377.
## 3 2020-02-26 13:10:00              392.
## 4 2020-02-26 13:20:00              411.
## 5 2020-02-26 13:30:00              430.
## 6 2020-02-26 13:40:00              432.

Membaca data library1 dari file CSV

df1 <- read.csv("library1.csv") # nolint
head(df1)
##                    ts              name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442   KWh    1489442   NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449   KWh    1489449    7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456   KWh    1489456    7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464   KWh    1489464    8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471   KWh    1489471    7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479   KWh    1489479    8

Membaca data library2 dari file CSV

df2 <- read.csv("library2.csv") # nolint
head(df2)
##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016   KWh    2129016   NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034   KWh    2129034   18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054   KWh    2129054   20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071   KWh    2129071   17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086   KWh    2129086   15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103   KWh    2129103   17

Membaca data library3 dari file CSV

df3 <- read.csv("library3.csv") # nolint
head(df3)
##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209   KWh    6914209   NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266   KWh    6914266   57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310   KWh    6914310   44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376   KWh    6914376   66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439   KWh    6914439   63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474   KWh    6914474   35

Mengubah kolom ‘time’ menjadi format POSIXct

df1$ts <- ymd_hms(df1$ts)
df2$ts <- ymd_hms(df2$ts)
df3$ts <- ymd_hms(df3$ts)

Menggabungkan ketiga data frame berdasarkan kolom ‘ts’ dan Menjumlahkan kolom rate pada setiap interval waktu; nolint

combined_data <- df1 %>%
  full_join(df2, by = "ts") %>%
  full_join(df3, by = "ts") %>%
  mutate(total_rate = rowSums(select(., starts_with("rate")), na.rm = TRUE)) %>%
  select(ts, total_rate)
head(combined_data)
##                    ts total_rate
## 1 2020-01-01 00:00:00          0
## 2 2020-01-01 00:10:00         82
## 3 2020-01-01 00:20:00         71
## 4 2020-01-01 00:30:00         91
## 5 2020-01-01 00:40:00         85
## 6 2020-01-01 00:50:00         60

imputasi nilai 0 pada kolom total_rate dengan rata-rata 144 pengamatan pertama

combined_data$total_rate[combined_data$total_rate == 0] <- mean(combined_data$total_rate[1:144], na.rm = TRUE) # nolint
head(combined_data)
##                    ts total_rate
## 1 2020-01-01 00:00:00   64.36806
## 2 2020-01-01 00:10:00   82.00000
## 3 2020-01-01 00:20:00   71.00000
## 4 2020-01-01 00:30:00   91.00000
## 5 2020-01-01 00:40:00   85.00000
## 6 2020-01-01 00:50:00   60.00000

Menggabungkan library_data_resampled dengan combined_data berdasarkan interval waktu # nolint

final_data <- library_data_resampled %>%
  inner_join(combined_data, by = c("interval" = "ts"))
head(final_data)
## # A tibble: 6 × 3
##   interval            mean_client_count total_rate
##   <dttm>                          <dbl>      <dbl>
## 1 2020-02-01 00:00:00              22.8       64.4
## 2 2020-02-01 00:10:00              20.6       97  
## 3 2020-02-01 00:20:00              17.2       96  
## 4 2020-02-01 00:30:00              15.4      100  
## 5 2020-02-01 00:40:00              12.4       89  
## 6 2020-02-01 00:50:00              10.5       91

Melihat shape final_data

dim(final_data)
## [1] 3683    3

Mengecek missing value pada final_data

sum(is.na(final_data))
## [1] 0

Mengecek data duplikat pada final_data

sum(duplicated(final_data))
## [1] 0

Visualisasi Data

Time Series Plot

time_series_plot <- ggplot(final_data, aes(x = interval)) +
  geom_line(aes(y = mean_client_count, color = "Occupancy")) +
  geom_line(aes(y = total_rate, color = "Energy Consumption")) +
  labs(title = "Time Series Plot of Occcupancy and Energy Consumption",
       x = "Interval", y = "Value", color = "Legend") +
  theme_minimal()
print(time_series_plot)

# Scatter Plot

scatter_plot <- ggplot(final_data, aes(x = mean_client_count, y = total_rate)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Scatter Plot: Occupancy vs Energy Consumption",
       x = "Occupancy", y = "Energy Consumption") +
  theme_minimal()
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Daily Profile

daily_profiles <- final_data %>%
  mutate(date = as.Date(interval),
         hour = hour(interval) + minute(interval)/60) %>%
  group_by(date, hour) %>%
  summarise(occupancy = mean(mean_client_count, na.rm = TRUE),
            energy = mean(total_rate, na.rm = TRUE))
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
daily1 <- ggplot(daily_profiles, aes(x = hour, y = energy, group = date)) +
  geom_line(alpha = 0.3, color = "blue") +
  labs(title = "Daily Energy Profiles (24h)",
       x = "Hour of Day", y = "Energy Consumption") +
  theme_minimal()
print(daily_profiles)
## # A tibble: 3,683 × 4
## # Groups:   date [26]
##    date        hour occupancy energy
##    <date>     <dbl>     <dbl>  <dbl>
##  1 2020-02-01 0         22.8    64.4
##  2 2020-02-01 0.167     20.6    97  
##  3 2020-02-01 0.333     17.2    96  
##  4 2020-02-01 0.5       15.4   100  
##  5 2020-02-01 0.667     12.4    89  
##  6 2020-02-01 0.833     10.5    91  
##  7 2020-02-01 1          9.25   90  
##  8 2020-02-01 1.17       8.67   92  
##  9 2020-02-01 1.33       8.5    87  
## 10 2020-02-01 1.5        8.38   83  
## # ℹ 3,673 more rows
print(daily1)

# Overall Average Daily Profile

# Tambahkan kolom jam-menit dalam sehari
final_data <- final_data %>%
  mutate(time_of_day = format(interval, "%H"))

# Buat profil rata-rata harian
avg_daily <- final_data %>%
  group_by(time_of_day) %>%
  summarise(
    avg_occupancy = mean(mean_client_count, na.rm = TRUE),
    avg_energy = mean(total_rate, na.rm = TRUE)
  )
# Visualisasi profil rata-rata harian
ggplot(avg_daily, aes(x = time_of_day)) +
  geom_line(aes(y = avg_occupancy, color = "Occupancy", group = 1), linewidth = 1) +
  geom_line(aes(y = avg_energy, color = "Energy Consumption", group = 1), linewidth = 1) +
  labs(title = "Overall Average Daily Profile",
       x = "Time of Day", y = "Average Value", color = "Legend") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Profil rata-rata harian berdasarkan weekend vs weekday

# Tambahkan kolom hari
final_data <- final_data %>%
  mutate(
    day_of_week = wday(interval, label = TRUE),  # Hari dalam format factor (Sun, Mon, ...)
    day_type = ifelse(day_of_week %in% c("Sat", "Sun"), "Weekend", "Weekday")
  )
avg_daily_daytype <- final_data %>%
  mutate(hour = hour(interval) + minute(interval)/60) %>%
  group_by(day_type, hour) %>%
  summarise(
    avg_occupancy = mean(mean_client_count, na.rm = TRUE),
    avg_energy = mean(total_rate, na.rm = TRUE)
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'day_type'. You can override using the
## `.groups` argument.

Visualisasi Occupancy

ggplot(avg_daily_daytype, aes(x = hour, y = avg_occupancy, color = day_type)) +
  geom_line(linewidth = 1) +
  labs(title = "Average Occupancy: Weekend vs Weekday",
       x = "Hour of Day", y = "Average Occupancy", color = "Day Type") +
  theme_minimal()

# Visualisasi Energy Consumption

ggplot(avg_daily_daytype, aes(x = hour, y = avg_energy, color = day_type)) +
  geom_line(linewidth = 1) +
  labs(title = "Average Energy Consumption: Weekend vs Weekday",
       x = "Hour of Day", y = "Average Energy Consumption", color = "Day Type") +
  theme_minimal()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.