Import Library

Load Dataset

wifi <- read.csv("D:/Semester 5/Data Mining & Visualization/file/wifi.csv", header = TRUE)
library1 <- read.csv("D:/Semester 5/Data Mining & Visualization/file/library1.csv", header = TRUE)
library2 <- read.csv("D:/Semester 5/Data Mining & Visualization/file/library2.csv", header = TRUE)
library3 <- read.csv("D:/Semester 5/Data Mining & Visualization/file/library3.csv", header = TRUE)

Exploratory Data

head(wifi)           
##                  time                   Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                     184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      16
##   Authenticated.Client.Count                   Uni                 Building
## 1                        182 Lancaster University         Graduate College 
## 2                          6 Lancaster University        Management School 
## 3                         18 Lancaster University             SW hse 32-33 
## 4                         23 Lancaster University                SW hse 29 
## 5                         45 Lancaster University            Furness outer 
## 6                         16 Lancaster University   Slaidburn House (LUSU) 
##      Floor
## 1  A Floor
## 2  C Floor
## 3  D Floor
## 4  B floor
## 5  C floor
## 6  E floor
head(library1)        
##                    ts              name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442   KWh    1489442   NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449   KWh    1489449    7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456   KWh    1489456    7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464   KWh    1489464    8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471   KWh    1489471    7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479   KWh    1489479    8
head(library2)        
##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016   KWh    2129016   NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034   KWh    2129034   18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054   KWh    2129054   20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071   KWh    2129071   17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086   KWh    2129086   15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103   KWh    2129103   17
head(library3)       
##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209   KWh    6914209   NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266   KWh    6914266   57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310   KWh    6914310   44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376   KWh    6914376   66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439   KWh    6914439   63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474   KWh    6914474   35
print(dim(wifi))
## [1] 1883844       7
print(dim(library1))
## [1] 18864     6
print(dim(library2))
## [1] 18864     6
print(dim(library3))
## [1] 18864     6
str(wifi)
## 'data.frame':    1883844 obs. of  7 variables:
##  $ time                      : chr  "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event.Time                : chr  "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated.Client.Count   : int  184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated.Client.Count: int  182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr  "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
##  $ Building                  : chr  " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
##  $ Floor                     : chr  " A Floor" " C Floor" " D Floor" " B floor" ...
str(library1)
## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : chr  "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
##  $ name      : chr  "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
##  $ reading   : num  1489442 1489449 1489456 1489464 1489471 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  1489442 1489449 1489456 1489464 1489471 ...
##  $ rate      : num  NA 7 7 8 7 8 7 8 7 8 ...
str(library2)
## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : chr  "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
##  $ name      : chr  "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
##  $ reading   : num  2129016 2129034 2129054 2129071 2129086 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  2129016 2129034 2129054 2129071 2129086 ...
##  $ rate      : num  NA 18 20 17 15 17 17 17 17 17 ...
str(library3)
## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : chr  "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
##  $ name      : chr  "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
##  $ reading   : num  6914209 6914266 6914310 6914376 6914439 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  6914209 6914266 6914310 6914376 6914439 ...
##  $ rate      : num  NA 57 44 66 63 35 47 55 34 41 ...
wifi$time <- ymd_hms(wifi$time)
library1$ts <- ymd_hms(library1$ts)
library2$ts <- ymd_hms(library2$ts)
library3$ts <- ymd_hms(library3$ts)
str(wifi)
## 'data.frame':    1883844 obs. of  7 variables:
##  $ time                      : POSIXct, format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event.Time                : chr  "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated.Client.Count   : int  184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated.Client.Count: int  182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr  "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
##  $ Building                  : chr  " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
##  $ Floor                     : chr  " A Floor" " C Floor" " D Floor" " B floor" ...
str(library1)
## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : POSIXct, format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
##  $ name      : chr  "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
##  $ reading   : num  1489442 1489449 1489456 1489464 1489471 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  1489442 1489449 1489456 1489464 1489471 ...
##  $ rate      : num  NA 7 7 8 7 8 7 8 7 8 ...
str(library2)
## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : POSIXct, format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
##  $ name      : chr  "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
##  $ reading   : num  2129016 2129034 2129054 2129071 2129086 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  2129016 2129034 2129054 2129071 2129086 ...
##  $ rate      : num  NA 18 20 17 15 17 17 17 17 17 ...
str(library3)
## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : POSIXct, format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
##  $ name      : chr  "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
##  $ reading   : num  6914209 6914266 6914310 6914376 6914439 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  6914209 6914266 6914310 6914376 6914439 ...
##  $ rate      : num  NA 57 44 66 63 35 47 55 34 41 ...
unique(wifi$Building)
##  [1] " Graduate College "                 " Management School "               
##  [3] " SW hse 32-33 "                     " SW hse 29 "                       
##  [5] " Furness outer "                    " Slaidburn House (LUSU) "          
##  [7] " SW hse 34 "                        " Bowland hall "                    
##  [9] " Fylde "                            " SW hse 55-56 "                    
## [11] " SW hse 40-42 "                     " Bowland Twr (Old Bowland Annexe) "
## [13] " Pendle "                           " Faraday and cTAP "                
## [15] " Institute for Advanced Studies "   " University House "                
## [17] " SW hse 53-54 "                     " Engineering "                     
## [19] " Field Station "                    " SW hse 36 "                       
## [21] " Infolab "                          " SW hse 21-23 "                    
## [23] " LICA "                             " FU Hse 71-74 "                    
## [25] " Grizedale "                        " Charles Carter "                  
## [27] " Furness "                          " SW hse 43-45 "                    
## [29] " SW hse 12-16 "                     " Bowland Main "                    
## [31] " SW hse 35 "                        " Human Resources "                 
## [33] " County "                           " FY Hse 65-70 "                    
## [35] " Barker House Farm "                " SW hse 27-28 "                    
## [37] " Bowland Annexe "                   " John Creed "                      
## [39] " grize-res "                        " SW hse 24-26 "                    
## [41] " SW hse 20 "                        " Ruskin Library "                  
## [43] " County South/Cartmel "             " Library "                         
## [45] " Conference Centre "                " Postgrad Stats (PSC) "            
## [47] " Bowland North "                    " George Fox "                      
## [49] " Hse 75 77 "                        " Bailrigg House "                  
## [51] " Sports Centre "                    " SW hse 30-31 "                    
## [53] " Bowland Ash "                      " Alex Square "                     
## [55] " LEC "                              " MDC "                             
## [57] " ISS Building "                     " Chaplaincy Centre "               
## [59] " CETAD "                            " SW hse 50-52 "                    
## [61] " SW hse 17-19 "                     " Central Workshops "               
## [63] " SW hse 46-49 "                     " SW hse 39 "                       
## [65] " Science and technology "           " Whewell "                         
## [67] " SW hse 37-38 "                     " Lonsdale College (SW) "           
## [69] " Physics "                          " County Field "                    
## [71] " Great Hall "                       " SW hse-158-179 "                  
## [73] " Preschool "                        " Reception "                       
## [75] " Hazelrigg "                        " Energy Centre "
wifi_library <- subset(wifi, Building ==  " Library " )
head(wifi_library)
##                    time                   Event.Time Associated.Client.Count
## 74  2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      29
## 158 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       0
## 235 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      21
## 260 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      38
## 291 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                       0
## 497 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                      33
##     Authenticated.Client.Count                   Uni  Building     Floor
## 74                          28 Lancaster University   Library    A floor
## 158                          0 Lancaster University   Library   LG floor
## 235                         20 Lancaster University   Library    C floor
## 260                         37 Lancaster University   Library    B floor
## 291                          0 Lancaster University   Library   LG floor
## 497                         31 Lancaster University   Library    A floor
colSums(is.na(wifi_library))
##                       time                 Event.Time 
##                          0                          0 
##    Associated.Client.Count Authenticated.Client.Count 
##                          0                          0 
##                        Uni                   Building 
##                          0                          0 
##                      Floor 
##                          0
wifi_library_10min <- wifi_library %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(mean_count = mean(Associated.Client.Count, na.rm = TRUE)) %>%
  ungroup()
print(wifi_library_10min)
## # A tibble: 3,683 × 2
##    time                mean_count
##    <dttm>                   <dbl>
##  1 2020-02-01 00:00:00      22.8 
##  2 2020-02-01 00:10:00      20.6 
##  3 2020-02-01 00:20:00      17.2 
##  4 2020-02-01 00:30:00      15.4 
##  5 2020-02-01 00:40:00      12.4 
##  6 2020-02-01 00:50:00      10.5 
##  7 2020-02-01 01:00:00       9.25
##  8 2020-02-01 01:10:00       8.67
##  9 2020-02-01 01:20:00       8.5 
## 10 2020-02-01 01:30:00       8.38
## # ℹ 3,673 more rows
# Data yang duplikat
wifi_library_10min[duplicated(wifi_library_10min$time), ]
## # A tibble: 0 × 2
## # ℹ 2 variables: time <dttm>, mean_count <dbl>
library1[duplicated(library1$ts), ]
## [1] ts         name       reading    units      cumulative rate      
## <0 rows> (or 0-length row.names)
library2[duplicated(library2$ts), ]
## [1] ts         name       reading    units      cumulative rate      
## <0 rows> (or 0-length row.names)
library3[duplicated(library3$ts), ]
## [1] ts         name       reading    units      cumulative rate      
## <0 rows> (or 0-length row.names)
# Cek data kosong (NA) di setiap kolom
colSums(is.na(wifi_library_10min))
##       time mean_count 
##          0          0
colSums(is.na(library1))
##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041       3047
colSums(is.na(library2))
##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041       3047
colSums(is.na(library3))
##         ts       name    reading      units cumulative       rate 
##          0          0       3041          0       3041       3047
library1$rate[is.na(library1$rate)] <- mean(library1$rate[1:144], na.rm = TRUE)
library2$rate[is.na(library2$rate)] <- mean(library2$rate[1:144], na.rm = TRUE)
library3$rate[is.na(library3$rate)] <- mean(library3$rate[1:144], na.rm = TRUE)

Data Integration

# Data Integration
energy <- merge(library1, library2, by = "ts")
energy <- merge(energy, library3, by = "ts")

head(energy)
##                    ts            name.x reading.x units.x cumulative.x   rate.x
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048   1489442     KWh      1489442 7.384615
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048   1489449     KWh      1489449 7.000000
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048   1489456     KWh      1489456 7.000000
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048   1489464     KWh      1489464 8.000000
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048   1489471     KWh      1489471 7.000000
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048   1489479     KWh      1489479 8.000000
##               name.y reading.y units.y cumulative.y   rate.y               name
## 1 MC065-L01/M11R2056   2129016     KWh      2129016 17.87413 MC065-L01/M13R2064
## 2 MC065-L01/M11R2056   2129034     KWh      2129034 18.00000 MC065-L01/M13R2064
## 3 MC065-L01/M11R2056   2129054     KWh      2129054 20.00000 MC065-L01/M13R2064
## 4 MC065-L01/M11R2056   2129071     KWh      2129071 17.00000 MC065-L01/M13R2064
## 5 MC065-L01/M11R2056   2129086     KWh      2129086 15.00000 MC065-L01/M13R2064
## 6 MC065-L01/M11R2056   2129103     KWh      2129103 17.00000 MC065-L01/M13R2064
##   reading units cumulative     rate
## 1 6914209   KWh    6914209 39.55944
## 2 6914266   KWh    6914266 57.00000
## 3 6914310   KWh    6914310 44.00000
## 4 6914376   KWh    6914376 66.00000
## 5 6914439   KWh    6914439 63.00000
## 6 6914474   KWh    6914474 35.00000
energy$total_rate <- rowSums(energy[, c("rate.x", "rate.y", "rate")], na.rm = TRUE)
energy <- energy[, c("ts", "total_rate")]
energy$ts <- as.POSIXct(energy$ts)
head(energy)
##                    ts total_rate
## 1 2020-01-01 00:00:00   64.81818
## 2 2020-01-01 00:10:00   82.00000
## 3 2020-01-01 00:20:00   71.00000
## 4 2020-01-01 00:30:00   91.00000
## 5 2020-01-01 00:40:00   85.00000
## 6 2020-01-01 00:50:00   60.00000
ggplot() +
  geom_line(data = wifi_library_10min, aes(x = time, y = mean_count, color = "Occupancy (WiFi)"), alpha = 0.7) +
  geom_line(data = energy, aes(x = ts, y = total_rate, color = "Energy Consumption (kWh)"), alpha = 0.7) +
  labs(title = "Time Series of Library Occupancy vs Energy Consumption",
       x = "Time", y = "Count / kWh", color = "") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Mencari Min Max Occupancy
min_time <- min(wifi_library_10min$time, na.rm = TRUE)
max_time <- max(wifi_library_10min$time, na.rm = TRUE)
#Filter data energy pada rentang min_time - max_time
energy_filtered <- subset(energy, ts >= min_time & ts <= max_time)
head(energy_filtered)
##                       ts total_rate
## 4465 2020-02-01 00:00:00   64.81818
## 4466 2020-02-01 00:10:00   97.00000
## 4467 2020-02-01 00:20:00   96.00000
## 4468 2020-02-01 00:30:00  100.00000
## 4469 2020-02-01 00:40:00   89.00000
## 4470 2020-02-01 00:50:00   91.00000
#Data Integration antara wifi dan energy
df <- data.frame(
  time = wifi_library_10min$time,
  occupancy = wifi_library_10min$mean_count,
  rate = energy_filtered$total_rate
)

df <- na.omit(df)
head(df)
##                  time occupancy      rate
## 1 2020-02-01 00:00:00    22.750  64.81818
## 2 2020-02-01 00:10:00    20.625  97.00000
## 3 2020-02-01 00:20:00    17.250  96.00000
## 4 2020-02-01 00:30:00    15.375 100.00000
## 5 2020-02-01 00:40:00    12.375  89.00000
## 6 2020-02-01 00:50:00    10.500  91.00000

Data Visualization and Analysis

ggplot(df, aes(x = time)) +
  geom_line(aes(y = occupancy, color = "Occupancy (WiFi)"), alpha = 0.7) +
  geom_line(aes(y = rate, color = "Energy Consumption (kWh)"), alpha = 0.7) +
  labs(title = "Time Series of Library Occupancy vs Energy Consumption",
       x = "Time", y = "Count / kWh", color = "") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Scatterplot antara Occupancy vs Rate
ggplot(df, aes(x = occupancy, y = rate)) +
  geom_point(color = "blue", shape = 16, size = 1) +
  labs(x = "Occupancy", y = "Rate") +
  theme_minimal()

# Filter data hanya yang terdapat pada 1 hari full
lastdate <- max(as.Date(df$time))
df <- subset(df, as.Date(time) < lastdate)
df$time <- as.POSIXct(df$time)
df$hour <- as.numeric(format(df$time, "%H")) + as.numeric(format(df$time, "%M"))/60
df$date <- as.Date(df$time)

df$hour_of_day <- as.POSIXct(df$hour * 3600, origin = "1970-01-01", tz = "UTC")
df$hour_of_day <- lubridate::round_date(df$hour_of_day, "10 minutes")

# rata-rata per jam across days
df_daily <- df %>%
  group_by(hour) %>%
  summarise(
    occupancy = mean(occupancy, na.rm = TRUE),
    rate = mean(rate, na.rm = TRUE)
  ) %>%
  ungroup()

df_daily$hour_of_day <- as.POSIXct(df_daily$hour * 3600, origin = "1970-01-01", tz = "UTC")
df_daily$hour_of_day <- lubridate::round_date(df_daily$hour_of_day, "10 minutes")
df_daily
## # A tibble: 144 × 4
##     hour occupancy  rate hour_of_day        
##    <dbl>     <dbl> <dbl> <dttm>             
##  1 0          48.7  128. 1970-01-01 00:00:00
##  2 0.167      44.0  127. 1970-01-01 00:10:00
##  3 0.333      40.4  123. 1970-01-01 00:20:00
##  4 0.5        37.8  124  1970-01-01 00:30:00
##  5 0.667      34.9  124. 1970-01-01 00:40:00
##  6 0.833      31.9  121. 1970-01-01 00:50:00
##  7 1          29.7  121. 1970-01-01 01:00:00
##  8 1.17       27.1  116. 1970-01-01 01:10:00
##  9 1.33       25.9  113. 1970-01-01 01:20:00
## 10 1.5        24.3  111. 1970-01-01 01:30:00
## # ℹ 134 more rows
# Occupancy profile
p1 <- ggplot(df, aes(x = hour_of_day, y = occupancy, group = date)) +
  geom_line(color = "blue", alpha = 0.2) +
  geom_line(data = df_daily, aes(x = hour_of_day, y = occupancy, group = 1),
            color = "red", size = 1) +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  labs(title = "Daily Profiles of Occupancy (01 Feb 2020 - 25 Feb 2020)",
       y = "Occupancy", x = NULL) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Rate profile
p2 <- ggplot(df, aes(x = hour_of_day, y = rate, group = date)) +
  geom_line(color = "green", alpha = 0.2) +
  geom_line(data = df_daily, aes(x = hour_of_day, y = rate, group = 1),
            color = "orange", size = 1) +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  labs(title = "Daily Profiles of Rate (01 Feb 2020 - 25 Feb 2020)",
       y = "Rate", x = "Hour of Day") +
  theme_minimal()

# Gabung dua plot
p1 / p2

# Separate data Weekdays dan Weekend
df$day_of_week <- as.integer(format(df$time, "%w"))   # 0 = Minggu, 6 = Sabtu
df$weekend <- df$day_of_week >= 6 | df$day_of_week == 0
head(df)
##                  time occupancy      rate      hour       date
## 1 2020-02-01 00:00:00    22.750  64.81818 0.0000000 2020-02-01
## 2 2020-02-01 00:10:00    20.625  97.00000 0.1666667 2020-02-01
## 3 2020-02-01 00:20:00    17.250  96.00000 0.3333333 2020-02-01
## 4 2020-02-01 00:30:00    15.375 100.00000 0.5000000 2020-02-01
## 5 2020-02-01 00:40:00    12.375  89.00000 0.6666667 2020-02-01
## 6 2020-02-01 00:50:00    10.500  91.00000 0.8333333 2020-02-01
##           hour_of_day day_of_week weekend
## 1 1970-01-01 00:00:00           6    TRUE
## 2 1970-01-01 00:10:00           6    TRUE
## 3 1970-01-01 00:20:00           6    TRUE
## 4 1970-01-01 00:30:00           6    TRUE
## 5 1970-01-01 00:40:00           6    TRUE
## 6 1970-01-01 00:50:00           6    TRUE
# Rata-rata weekday
df_weekday <- df %>%
  filter(!weekend) %>%
  group_by(hour) %>%
  summarise(
    occupancy = mean(occupancy, na.rm = TRUE),
    rate = mean(rate, na.rm = TRUE)
  ) %>%
  ungroup()

df_weekday$hour_of_day <- as.POSIXct(df_weekday$hour * 3600, origin = "1970-01-01", tz = "UTC")
df_weekday$hour_of_day <- lubridate::round_date(df_weekday$hour_of_day, "10 minutes")

# Rata-rata weekend
df_weekend <- df %>%
  filter(weekend) %>%
  group_by(hour) %>%
  summarise(
    occupancy = mean(occupancy, na.rm = TRUE),
    rate = mean(rate, na.rm = TRUE)
  ) %>%
  ungroup()

df_weekend$hour_of_day <- as.POSIXct(df_weekend$hour * 3600, origin = "1970-01-01", tz = "UTC")
df_weekend$hour_of_day <- lubridate::round_date(df_weekend$hour_of_day, "10 minutes")
head(df_weekend)
## # A tibble: 6 × 4
##    hour occupancy  rate hour_of_day        
##   <dbl>     <dbl> <dbl> <dttm>             
## 1 0          31.1  109. 1970-01-01 00:00:00
## 2 0.167      28.4  112. 1970-01-01 00:10:00
## 3 0.333      25.4  109. 1970-01-01 00:20:00
## 4 0.5        23.8  111. 1970-01-01 00:30:00
## 5 0.667      21.6  109. 1970-01-01 00:40:00
## 6 0.833      19.7  115. 1970-01-01 00:50:00
head(df_weekday)
## # A tibble: 6 × 4
##    hour occupancy  rate hour_of_day        
##   <dbl>     <dbl> <dbl> <dttm>             
## 1 0          56.9  137. 1970-01-01 00:00:00
## 2 0.167      51.3  134. 1970-01-01 00:10:00
## 3 0.333      47.4  129. 1970-01-01 00:20:00
## 4 0.5        44.3  130. 1970-01-01 00:30:00
## 5 0.667      41.1  131. 1970-01-01 00:40:00
## 6 0.833      37.6  124. 1970-01-01 00:50:00
#Visualisasi Weekday vs Weekend

# Occupancy
p_occ <- ggplot() +
  geom_line(data = df_weekday, aes(x = hour_of_day, y = occupancy, color = "Weekday"), size = 1) +
  geom_line(data = df_weekend, aes(x = hour_of_day, y = occupancy, color = "Weekend"), size = 1) +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "red")) +
  labs(title = "Weekday vs Weekend - Occupancy", y = "Occupancy", x = NULL, color = NULL) +
  theme_minimal()

# Rate
p_rate <- ggplot() +
  geom_line(data = df_weekday, aes(x = hour_of_day, y = rate, color = "Weekday"), size = 1) +
  geom_line(data = df_weekend, aes(x = hour_of_day, y = rate, color = "Weekend"), size = 1) +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "red")) +
  labs(title = "Weekday vs Weekend - Rate", y = "Rate", x = "Hour of Day", color = NULL) +
  theme_minimal()

p_occ / p_rate

# Identity Peak Hour
df$hour <- as.integer(format(df$time, "%H"))

occupancy_by_hour <- df %>%
  group_by(hour) %>%
  summarise(occupancy = mean(occupancy, na.rm = TRUE)) %>%
  ungroup()

peak_hours <- occupancy_by_hour %>%
  filter(occupancy == max(occupancy, na.rm = TRUE))

print(peak_hours)
## # A tibble: 1 × 2
##    hour occupancy
##   <int>     <dbl>
## 1    15      370.
ggplot(occupancy_by_hour, aes(x = factor(hour), y = occupancy)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Barplot Occupancy per Hour", x = "Hour", y = "Occupancy") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10))

#Examine whether occupancy significantly influences energy
correlation <- cor(df$occupancy, df$rate, use = "complete.obs")
print(paste("Korelasi Occupancy vs Energy Consumption (Rate) :", correlation))
## [1] "Korelasi Occupancy vs Energy Consumption (Rate) : 0.877768630777599"
#Highlight cases where energy consumption does not align with occupancy.
df <- df %>%
  mutate(
    z_rate = scale(rate),
    z_occupancy = scale(occupancy),
    anomaly = (abs(z_rate) > 2) & (abs(z_occupancy) < 2)
  )
#Visualisasi scatterplot dan mewarnai berbeda yang anomali
ggplot() +
  geom_point(data = subset(df, !anomaly), aes(x = occupancy, y = rate), color = "blue", alpha = 0.7) +
  geom_point(data = subset(df, anomaly), aes(x = occupancy, y = rate), color = "red", alpha = 0.7) +
  labs(title = "Occupancy vs Rate (Anomaly Highlight)",
       x = "Occupancy", y = "Energy Rate") +
  theme_minimal()