library(readr)
library(lubridate)
library(dplyr)    
library(magrittr)
library(ggplot2)
library(corrplot)
library(tidyr)
library(cowplot) # For arranging plots

2. Loading data

#import file
data_wifi <- read_csv("C:/Users/litat/Downloads/wifi.csv")
data_library1 <- read_csv("C:/Users/litat/Downloads/library1.csv")
data_library2 <- read_csv("C:/Users/litat/Downloads/library2.csv")
data_library3 <- read_csv("C:/Users/litat/Downloads/library3.csv")
#WIFI
#kolom di file wifi
colnames(data_wifi)
## [1] "time"                       "Event Time"                
## [3] "Associated Client Count"    "Authenticated Client Count"
## [5] "Uni"                        "Building"                  
## [7] "Floor"
#data di kolom Building
unique(data_wifi$Building)
##  [1] "Graduate College"                 "Management School"               
##  [3] "SW hse 32-33"                     "SW hse 29"                       
##  [5] "Furness outer"                    "Slaidburn House (LUSU)"          
##  [7] "SW hse 34"                        "Bowland hall"                    
##  [9] "Fylde"                            "SW hse 55-56"                    
## [11] "SW hse 40-42"                     "Bowland Twr (Old Bowland Annexe)"
## [13] "Pendle"                           "Faraday and cTAP"                
## [15] "Institute for Advanced Studies"   "University House"                
## [17] "SW hse 53-54"                     "Engineering"                     
## [19] "Field Station"                    "SW hse 36"                       
## [21] "Infolab"                          "SW hse 21-23"                    
## [23] "LICA"                             "FU Hse 71-74"                    
## [25] "Grizedale"                        "Charles Carter"                  
## [27] "Furness"                          "SW hse 43-45"                    
## [29] "SW hse 12-16"                     "Bowland Main"                    
## [31] "SW hse 35"                        "Human Resources"                 
## [33] "County"                           "FY Hse 65-70"                    
## [35] "Barker House Farm"                "SW hse 27-28"                    
## [37] "Bowland Annexe"                   "John Creed"                      
## [39] "grize-res"                        "SW hse 24-26"                    
## [41] "SW hse 20"                        "Ruskin Library"                  
## [43] "County South/Cartmel"             "Library"                         
## [45] "Conference Centre"                "Postgrad Stats (PSC)"            
## [47] "Bowland North"                    "George Fox"                      
## [49] "Hse 75 77"                        "Bailrigg House"                  
## [51] "Sports Centre"                    "SW hse 30-31"                    
## [53] "Bowland Ash"                      "Alex Square"                     
## [55] "LEC"                              "MDC"                             
## [57] "ISS Building"                     "Chaplaincy Centre"               
## [59] "CETAD"                            "SW hse 50-52"                    
## [61] "SW hse 17-19"                     "Central Workshops"               
## [63] "SW hse 46-49"                     "SW hse 39"                       
## [65] "Science and technology"           "Whewell"                         
## [67] "SW hse 37-38"                     "Lonsdale College (SW)"           
## [69] "Physics"                          "County Field"                    
## [71] "Great Hall"                       "SW hse-158-179"                  
## [73] "Preschool"                        "Reception"                       
## [75] "Hazelrigg"                        "Energy Centre"
#filter building=library
wifi_library <- data_wifi[data_wifi$Building == "Library", ]
head(wifi_library)
## # A tibble: 6 × 7
##   time          `Event Time` Associated Client Co…¹ Authenticated Client…² Uni  
##   <chr>         <chr>                         <dbl>                  <dbl> <chr>
## 1 2/1/2020 0:02 Sat Feb 01 …                     29                     28 Lanc…
## 2 2/1/2020 0:02 Sat Feb 01 …                      0                      0 Lanc…
## 3 2/1/2020 0:02 Sat Feb 01 …                     21                     20 Lanc…
## 4 2/1/2020 0:02 Sat Feb 01 …                     38                     37 Lanc…
## 5 2/1/2020 0:07 Sat Feb 01 …                      0                      0 Lanc…
## 6 2/1/2020 0:07 Sat Feb 01 …                     33                     31 Lanc…
## # ℹ abbreviated names: ¹​`Associated Client Count`,
## #   ²​`Authenticated Client Count`
## # ℹ 2 more variables: Building <chr>, Floor <chr>
str(wifi_library)
## tibble [15,948 × 7] (S3: tbl_df/tbl/data.frame)
##  $ time                      : chr [1:15948] "2/1/2020 0:02" "2/1/2020 0:02" "2/1/2020 0:02" "2/1/2020 0:02" ...
##  $ Event Time                : chr [1:15948] "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated Client Count   : num [1:15948] 29 0 21 38 0 33 22 39 30 0 ...
##  $ Authenticated Client Count: num [1:15948] 28 0 20 37 0 31 20 37 27 0 ...
##  $ Uni                       : chr [1:15948] "Lancaster University" "Lancaster University" "Lancaster University" "Lancaster University" ...
##  $ Building                  : chr [1:15948] "Library" "Library" "Library" "Library" ...
##  $ Floor                     : chr [1:15948] "A floor" "LG floor" "C floor" "B floor" ...
#ubah ke datetime
wifi_library$time <- as.POSIXct(wifi_library$time, format = "%m/%d/%Y %H:%M")
#interval 10 menit
wifi_library <- wifi_library %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%                         # kelompokkan per 10 menit
  summarise(across(where(is.numeric), mean, na.rm = TRUE), .groups = "drop")

3.2 Library Energy Dataset

#ENERGY
#cek type data
str(data_library1)
## spc_tbl_ [18,864 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ts        : chr [1:18864] "1/1/2020 0:00" "1/1/2020 0:10" "1/1/2020 0:20" "1/1/2020 0:30" ...
##  $ name      : chr [1:18864] "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
##  $ reading   : num [1:18864] 1489442 1489449 1489456 1489464 1489471 ...
##  $ units     : chr [1:18864] "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num [1:18864] 1489442 1489449 1489456 1489464 1489471 ...
##  $ rate      : num [1:18864] NA 7 7 8 7 8 7 8 7 8 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ts = col_character(),
##   ..   name = col_character(),
##   ..   reading = col_double(),
##   ..   units = col_character(),
##   ..   cumulative = col_double(),
##   ..   rate = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(data_library2)
## spc_tbl_ [18,864 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ts        : POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
##  $ name      : chr [1:18864] "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
##  $ reading   : num [1:18864] 2129016 2129034 2129054 2129071 2129086 ...
##  $ units     : chr [1:18864] "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num [1:18864] 2129016 2129034 2129054 2129071 2129086 ...
##  $ rate      : num [1:18864] NA 18 20 17 15 17 17 17 17 17 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ts = col_datetime(format = ""),
##   ..   name = col_character(),
##   ..   reading = col_double(),
##   ..   units = col_character(),
##   ..   cumulative = col_double(),
##   ..   rate = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(data_library3)
## spc_tbl_ [18,864 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ts        : POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
##  $ name      : chr [1:18864] "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
##  $ reading   : num [1:18864] 6914209 6914266 6914310 6914376 6914439 ...
##  $ units     : chr [1:18864] "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num [1:18864] 6914209 6914266 6914310 6914376 6914439 ...
##  $ rate      : num [1:18864] NA 57 44 66 63 35 47 55 34 41 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ts = col_datetime(format = ""),
##   ..   name = col_character(),
##   ..   reading = col_double(),
##   ..   units = col_character(),
##   ..   cumulative = col_double(),
##   ..   rate = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
#ubah ke datetime
data_library1$ts <- as.POSIXct(data_library1$ts, format = "%m/%d/%Y %H:%M")
data_library2$ts <- as.POSIXct(data_library2$ts, format = "%m/%d/%Y %H:%M")
data_library3$ts <- as.POSIXct(data_library3$ts, format = "%m/%d/%Y %H:%M")
#rename kolom ts
names(data_library1)[names(data_library1) == "ts"]  <- "time"
names(data_library2)[names(data_library2) == "ts"]  <- "time"
names(data_library3)[names(data_library3) == "ts"]  <- "time"
#gabungin file library
gabung12 <- merge(data_library1, data_library2, by = "time", all = TRUE)  # full join
all_data_library <- merge(gabung12, data_library3, by = "time", all = TRUE)
head(all_data_library)
##                  time            name.x reading.x units.x cumulative.x rate.x
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048   1489442     KWh      1489442     NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048   1489449     KWh      1489449      7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048   1489456     KWh      1489456      7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048   1489464     KWh      1489464      8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048   1489471     KWh      1489471      7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048   1489479     KWh      1489479      8
##               name.y reading.y units.y cumulative.y rate.y               name
## 1 MC065-L01/M11R2056   2129016     KWh      2129016     NA MC065-L01/M13R2064
## 2 MC065-L01/M11R2056   2129034     KWh      2129034     18 MC065-L01/M13R2064
## 3 MC065-L01/M11R2056   2129054     KWh      2129054     20 MC065-L01/M13R2064
## 4 MC065-L01/M11R2056   2129071     KWh      2129071     17 MC065-L01/M13R2064
## 5 MC065-L01/M11R2056   2129086     KWh      2129086     15 MC065-L01/M13R2064
## 6 MC065-L01/M11R2056   2129103     KWh      2129103     17 MC065-L01/M13R2064
##   reading units cumulative rate
## 1 6914209   KWh    6914209   NA
## 2 6914266   KWh    6914266   57
## 3 6914310   KWh    6914310   44
## 4 6914376   KWh    6914376   66
## 5 6914439   KWh    6914439   63
## 6 6914474   KWh    6914474   35
energy <- all_data_library %>%
  select(time, rate.x, rate.y, rate) %>%          # ambil kolom yang diinginkan
  mutate(total_rate = rate.x + rate.y + rate)     # jumlahkan
head(energy)
##                  time rate.x rate.y rate total_rate
## 1 2020-01-01 00:00:00     NA     NA   NA         NA
## 2 2020-01-01 00:10:00      7     18   57         82
## 3 2020-01-01 00:20:00      7     20   44         71
## 4 2020-01-01 00:30:00      8     17   66         91
## 5 2020-01-01 00:40:00      7     15   63         85
## 6 2020-01-01 00:50:00      8     17   35         60
#gabungin wifi dan energy
final_data<- merge(wifi_library, energy, by = "time", all = TRUE)
head(final_data)
##                  time Associated Client Count Authenticated Client Count rate.x
## 1 2020-01-01 00:00:00                      NA                         NA     NA
## 2 2020-01-01 00:10:00                      NA                         NA      7
## 3 2020-01-01 00:20:00                      NA                         NA      7
## 4 2020-01-01 00:30:00                      NA                         NA      8
## 5 2020-01-01 00:40:00                      NA                         NA      7
## 6 2020-01-01 00:50:00                      NA                         NA      8
##   rate.y rate total_rate
## 1     NA   NA         NA
## 2     18   57         82
## 3     20   44         71
## 4     17   66         91
## 5     15   63         85
## 6     17   35         60
write_csv(final_data, "C:/Users/litat/Downloads/final_data.csv")
# =============== BAGIAN 2: DATA CLEANING ===============
# 1. Tangani Missing Values pada Energi
#    → isi NA dengan rata-rata dari 144 observasi pertama untuk tiap meter
for (col in c("rate.x", "rate.y", "rate")) {
  if (col %in% names(final_data)) {
    mean_144 <- mean(final_data[[col]][1:144], na.rm = TRUE)
    final_data[[col]][is.na(final_data[[col]])] <- mean_144
  }
}
# Energi total juga diperiksa
if ("total_rate" %in% names(final_data)) {
  mean_144_total <- mean(final_data$total_rate[1:144], na.rm = TRUE)
  final_data$total_rate[is.na(final_data$total_rate)] <- mean_144_total
}
# 2. Standarisasi Timestamp (kolom time)
final_data$time <- as.POSIXct(final_data$time, format="%Y-%m-%d %H:%M:%S", tz="UTC")
# 3. Hapus baris duplikat berdasarkan kolom time
final_data <- final_data[!duplicated(final_data$time), ]
# 4. Urutkan data berdasarkan waktu
final_data <- final_data[order(final_data$time), ]
# 5. Drop NA
final_data <- final_data %>% drop_na()
# 6. Simpan hasil cleaning
write_csv(final_data, "C:/Users/litat/Downloads/cleaned_library_data.csv")
# Cek hasil
head(final_data)
##                  time Associated Client Count Authenticated Client Count
## 1 2020-01-31 17:00:00                  22.750                     21.625
## 2 2020-01-31 17:10:00                  20.625                     19.250
## 3 2020-01-31 17:20:00                  17.250                     17.000
## 4 2020-01-31 17:30:00                  15.375                     15.250
## 5 2020-01-31 17:40:00                  12.375                     12.000
## 6 2020-01-31 17:50:00                  10.500                     10.500
##      rate.x   rate.y     rate total_rate
## 1  7.384615 17.87413 39.55944   64.81818
## 2 13.000000 11.00000 73.00000   97.00000
## 3 13.000000 11.00000 72.00000   96.00000
## 4 11.000000 15.00000 74.00000  100.00000
## 5 11.000000 13.00000 65.00000   89.00000
## 6 11.000000 16.00000 64.00000   91.00000

4.1 Time Series Plot of Occupancy and Electricity Consumption

# ---- VISUALISASI ----

# 1. Time Series 
# TIME SERIES - OCCUPANCY
ggplot(final_data, aes(x = time, y = `Associated Client Count`)) +
  geom_line(color = "blue", linewidth = 0.8) +
  labs(title = "Time Series of Occupancy in Library",
       x = "Time", 
       y = "Occupancy (Associated Clients)") +
  theme_minimal()

# TIME SERIES - ENERGY
ggplot(final_data, aes(x = time, y = total_rate)) +
  geom_line(color = "red", linewidth = 0.8) +
  labs(title = "Time Series of Energy Consumption in Library",
       x = "Time",
       y = "Energy Consumption (kWh)") +
  theme_minimal()

# 2. SCATTER PLOT
ggplot(final_data, aes(x = `Associated Client Count`, y = total_rate)) +
  geom_point(alpha = 0.4, color = "darkgreen") +
  geom_smooth(method = "lm", se = TRUE, color = "pink") +
  labs(title = "Scatter Plot: Occupancy vs Energy Consumption",
       x = "Occupancy (WiFi Clients)", 
       y = "Energy Consumption (kWh)") +
  theme_minimal()

# 3. DAILY PROFILES
# Tambahkan kolom jam & tanggal
final_data <- final_data %>%
  mutate(hour = hour(time),
         date = as.Date(time))

# Plot siklus harian occupancy
ggplot(final_data, aes(x = hour, y = `Associated Client Count`, group = date)) +
  geom_line(alpha = 0.7, color = "black") +
  labs(title = "Daily Occupancy Profiles (per Day)",
       x = "Hour of Day", 
       y = "Occupancy (WiFi Clients)") +
  theme_minimal()

# Plot siklus harian energi
ggplot(final_data, aes(x = hour, y = total_rate, group = date)) +
  geom_line(alpha = 0.7, color = "maroon") +
  labs(title = "Daily Energy Consumption Profiles (per Day)",
       x = "Hour of Day", 
       y = "Energy Consumption (kWh)") +
  theme_minimal()

# AVERAGE DAILY PROFILE
# Hitung rata-rata per jam (gabungan semua hari)
daily_avg <- final_data %>%
  group_by(hour) %>%
  summarise(avg_occ = mean(`Associated Client Count`, na.rm = TRUE),
            avg_energy = mean(total_rate, na.rm = TRUE))

# Plot Okupansi Rata-rata
p_avg_occ <- ggplot(daily_avg, aes(x = hour, y = avg_occ)) +
  geom_line(color = "orange", linewidth = 1.2) +
  geom_point(color = "orange", size = 2) +
  labs(title = "Average Daily Profile",
       x = "hour", 
       y = "Average Occupancy") +
  theme_minimal()

# Plot Energi Rata-rata
p_avg_energy <- ggplot(daily_avg, aes(x = hour, y = avg_energy)) +
  geom_line(color = "purple", linewidth = 1.2) +
  geom_point(color = "purple", size = 2) +
  labs(x = "hour", 
       y = "Average Energy Consumption (kWh)") +
  theme_minimal()

# Gabungkan kedua plot di atas satu sama lain
plot_grid(p_avg_occ, p_avg_energy, ncol = 1, align = 'v')

# 5. Analisis ## 5.1 Peak hours of occupancy

# ---- Analisis ----

# 1. MENENTUKAN JAM PUNCAK KUNJUNGAN

# Mengurutkan data rata-rata kunjungan per jam dari yang tertinggi
peak_hours <- daily_avg %>%
  arrange(desc(avg_occ))

# Menampilkan 5 jam tersibuk
print("Lima jam puncak kunjungan:")
## [1] "Lima jam puncak kunjungan:"
print(head(peak_hours, 5))
## # A tibble: 5 × 3
##    hour avg_occ avg_energy
##   <int>   <dbl>      <dbl>
## 1     8    362.       190.
## 2     7    355.       188.
## 3     9    334.       190.
## 4     6    330.       187.
## 5     5    298.       181.
# Visualisasi jam puncak dengan bar chart untuk lebih jelas
ggplot(daily_avg, aes(x = reorder(hour, -avg_occ), y = avg_occ)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = round(avg_occ, 1)), vjust = -0.5, size = 3.5) +
  labs(title = "Rata-Rata Kunjungan per Jam (Jam Puncak)",
       x = "Jam dalam Sehari",
       y = "Rata-Rata Jumlah Pengunjung (Klien WiFi)") +
  theme_minimal()

# 2. MELIHAT PENGARUH JUMLAH KUNJUNGAN TERHADAP KONSUMSI LISTRIK

# Menghitung koefisien korelasi Pearson
# 'use = "complete.obs"' digunakan untuk mengabaikan baris dengan nilai NA
correlation_value <- cor(final_data$`Associated Client Count`, final_data$total_rate, use = "complete.obs")

# Menampilkan hasil korelasi
print(paste("Koefisien korelasi antara jumlah pengunjung dan konsumsi energi adalah:", round(correlation_value, 4)))
## [1] "Koefisien korelasi antara jumlah pengunjung dan konsumsi energi adalah: 0.8675"
# 3. MENCARI KONDISI ANOMALI

# Menentukan batas kuantil (persentil)
# Q3 untuk nilai "tinggi" (75%) dan Q1 untuk nilai "rendah" (25%)
high_energy_threshold <- quantile(final_data$total_rate, 0.75, na.rm = TRUE)
low_occupancy_threshold <- quantile(final_data$`Associated Client Count`, 0.25, na.rm = TRUE)

high_occupancy_threshold <- quantile(final_data$`Associated Client Count`, 0.75, na.rm = TRUE)
low_energy_threshold <- quantile(final_data$total_rate, 0.25, na.rm = TRUE)


# Skenario 1: Listrik tinggi, padahal orang sedikit
high_energy_low_occ <- final_data %>%
  filter(total_rate > high_energy_threshold & `Associated Client Count` < low_occupancy_threshold)

print("Kondisi Anomali: Listrik Tinggi, Pengunjung Sedikit")
## [1] "Kondisi Anomali: Listrik Tinggi, Pengunjung Sedikit"
print(head(high_energy_low_occ))
## [1] time                       Associated Client Count   
## [3] Authenticated Client Count rate.x                    
## [5] rate.y                     rate                      
## [7] total_rate                 hour                      
## [9] date                      
## <0 rows> (or 0-length row.names)
# Skenario 2: Listrik rendah, padahal orang banyak (efisiensi energi)
low_energy_high_occ <- final_data %>%
  filter(total_rate < low_energy_threshold & `Associated Client Count` > high_occupancy_threshold)

print("Kondisi Anomali: Listrik Rendah, Pengunjung Banyak")
## [1] "Kondisi Anomali: Listrik Rendah, Pengunjung Banyak"
print(head(low_energy_high_occ))
## [1] time                       Associated Client Count   
## [3] Authenticated Client Count rate.x                    
## [5] rate.y                     rate                      
## [7] total_rate                 hour                      
## [9] date                      
## <0 rows> (or 0-length row.names)
# Visualisasi anomali pada scatter plot
ggplot(final_data, aes(x = `Associated Client Count`, y = total_rate)) +
  geom_point(alpha = 0.3, color = "grey") + # Data asli sebagai background
  geom_point(data = high_energy_low_occ, aes(color = "Listrik Tinggi, Org Sedikit"), size = 2.5) +
  geom_point(data = low_energy_high_occ, aes(color = "Listrik Rendah, Org Banyak"), size = 2.5) +
  labs(title = "Identifikasi Anomali pada Scatter Plot",
       x = "Jumlah Pengunjung (Klien WiFi)",
       y = "Konsumsi Energi (kWh)",
       color = "Kondisi Anomali") +
  theme_minimal()

# — 6. Weekend vs Weekday Comparison —

#WEEKDAY VS WEEKEND
final_data <- final_data %>%
  mutate(
    date = as.Date(time),
    hour = hour(time),
    weekday_num = wday(date, week_start = 1),            # 1=Mon … 7=Sun
    weekday = wday(date, label = TRUE, abbr = FALSE, week_start = 1),
    day_type = if_else(weekday_num %in% c(6,7), "Weekend", "Weekday") %>%
      factor(levels = c("Weekday","Weekend"))
  )

# jumlah pengunjung & energi per hari
daily_counts <- final_data %>%
  group_by(date, day_type) %>%
  summarise(
    occ = mean(`Associated Client Count`, na.rm = TRUE),
    energy = mean(total_rate, na.rm = TRUE),
    .groups = "drop"
  )

# rata-rata per jam
hourly_avg <- final_data %>%
  group_by(hour, day_type) %>%
  summarise(
    avg_occ = mean(`Associated Client Count`, na.rm = TRUE),
    avg_energy = mean(total_rate, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(daily_counts, aes(x = date, y = occ, color = day_type)) +
  geom_line() +
  labs(title = "Occupancy per Day: Weekday vs Weekend",
       x = "Date", y = "Occupancy") +
  theme_minimal()

ggplot(daily_counts, aes(x = date, y = energy, color = day_type)) +
  geom_line() +
  labs(title = "Energy per Day: Weekday vs Weekend",
       x = "Date", y = "Energy (kWh)") +
  theme_minimal()

ggplot(hourly_avg, aes(x = hour, y = avg_occ, color = day_type)) +
  geom_line(linewidth = 1.1) +
  geom_point() +
  scale_x_continuous(breaks = 0:23) +
  labs(title = "Average Occupancy per Hour",
       x = "Hour", y = "Avg Occupancy") +
  theme_minimal()

ggplot(hourly_avg, aes(x = hour, y = avg_energy, color = day_type)) +
  geom_line(linewidth = 1.1) +
  geom_point() +
  scale_x_continuous(breaks = 0:23) +
  labs(title = "Average Energy per Hour",
       x = "Hour", y = "Avg Energy (kWh)") +
  theme_minimal()

heat_occ <- final_data %>%
  group_by(weekday, hour, day_type) %>%
  summarise(avg_occ = mean(`Associated Client Count`, na.rm = TRUE), .groups = "drop")

ggplot(heat_occ, aes(x = hour, y = weekday, fill = avg_occ)) +
  geom_tile() +
  facet_wrap(~day_type) +
  scale_x_continuous(breaks = 0:23) +
  labs(title = "Heatmap Occupancy", x = "Hour", y = "Day") +
  theme_minimal()

# rata-rata occupancy per hari
weekday_occ <- daily_counts %>% filter(day_type=="Weekday") %>% pull(occ)
weekend_occ <- daily_counts %>% filter(day_type=="Weekend") %>% pull(occ)

t.test(weekday_occ, weekend_occ)  # kalau datanya normal
## 
##  Welch Two Sample t-test
## 
## data:  weekday_occ and weekend_occ
## t = 3.9403, df = 12.2, p-value = 0.001902
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   30.45667 105.49457
## sample estimates:
## mean of x mean of y 
## 154.10207  86.12645
# kalau data tidak normal: wilcox.test(weekday_occ, weekend_occ)

# distribusi jam occupancy
tab_hour <- final_data %>%
  mutate(hour = factor(hour)) %>%
  count(day_type, hour) %>%
  pivot_wider(names_from = day_type, values_from = n, values_fill = 0)

chisq.test(as.matrix(tab_hour[, -1]))
## 
##  Pearson's Chi-squared test
## 
## data:  as.matrix(tab_hour[, -1])
## X-squared = 0.71355, df = 23, p-value = 1