library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readr)
library(stringr)
wifi <- read_csv("C:/Users/ASUS/Downloads/wifi.csv")
## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib1 <- read_csv("C:/Users/ASUS/Downloads/library1.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib2 <- read_csv("C:/Users/ASUS/Downloads/library2.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib3 <- read_csv("C:/Users/ASUS/Downloads/library3.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2. Filter Library WiFi

library_wifi_data <- wifi %>%
  filter(str_trim(Building) == "Library") %>% 
  mutate(time = ymd_hms(time))
print(library_wifi_data)
## # A tibble: 28,652 × 7
##    time                `Event Time`                 `Associated Client Count`
##    <dttm>              <chr>                                            <dbl>
##  1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                        29
##  2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                         0
##  3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                        21
##  4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                        38
##  5 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                         0
##  6 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                        33
##  7 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                        22
##  8 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                        39
##  9 2020-02-01 00:12:43 Sat Feb 01 00:12:43 UTC 2020                        30
## 10 2020-02-01 00:12:43 Sat Feb 01 00:12:43 UTC 2020                         0
## # ℹ 28,642 more rows
## # ℹ 4 more variables: `Authenticated Client Count` <dbl>, Uni <chr>,
## #   Building <chr>, Floor <chr>
wifi_resampled <- library_wifi_data %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(Associated_Client_Count = mean(`Associated Client Count`, na.rm = TRUE),
            .groups = "drop")

cat("WiFi Library (10 menit):\n")
## WiFi Library (10 menit):
print(head(wifi_resampled))
## # A tibble: 6 × 2
##   time                Associated_Client_Count
##   <dttm>                                <dbl>
## 1 2020-02-01 00:00:00                    22.8
## 2 2020-02-01 00:10:00                    20.6
## 3 2020-02-01 00:20:00                    17.2
## 4 2020-02-01 00:30:00                    15.4
## 5 2020-02-01 00:40:00                    12.4
## 6 2020-02-01 00:50:00                    10.5

3. Standarkan kolom ts di lib1/lib2/lib3

print("Struktur timestamp dari masing-masing listrik:")
## [1] "Struktur timestamp dari masing-masing listrik:"
print(str(lib1$ts))
##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL
print(str(lib2$ts))
##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL
print(str(lib3$ts))
##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

Data Cleaning

lib1 <- lib1 %>%
  mutate(ts = parse_date_time(ts, orders = c("mdy HM", "mdy HMS")))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `ts = parse_date_time(ts, orders = c("mdy HM", "mdy HMS"))`.
## Caused by warning:
## ! All formats failed to parse. No formats found.
print(str(lib1$ts))
##  POSIXct[1:18864], format: NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA ...
## NULL

4. Gabungkan data listrik jadi satu

electricity_df <- bind_rows(lib1, lib2, lib3)

# Imputasi nilai rate yang NA dengan rata-rata 144 data pertama
electricity_df <- electricity_df %>%
  group_by(name) %>%
  mutate(rate = ifelse(is.na(rate),
                       mean(head(rate, 144), na.rm = TRUE),
                       rate)) %>%
  ungroup()

5. Agregasi listrik per 10 menit

total_electricity144 <- electricity_df %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(total_rate = sum(rate, na.rm = TRUE), .groups = "drop") %>%
  rename(time = ts)

cat("Data Listrik per 10 menit:\n")
## Data Listrik per 10 menit:
print(head(total_electricity144))
## # A tibble: 6 × 2
##   time                total_rate
##   <dttm>                   <dbl>
## 1 2020-01-01 00:00:00       57.4
## 2 2020-01-01 00:10:00       75  
## 3 2020-01-01 00:20:00       64  
## 4 2020-01-01 00:30:00       83  
## 5 2020-01-01 00:40:00       78  
## 6 2020-01-01 00:50:00       52

6.Gabungkan WiFi & Listrik

merged_data <- wifi_resampled %>%
  inner_join(total_electricity144, by = "time")

cat("Merged Data:\n")
## Merged Data:
print(head(merged_data))
## # A tibble: 6 × 3
##   time                Associated_Client_Count total_rate
##   <dttm>                                <dbl>      <dbl>
## 1 2020-02-01 00:00:00                    22.8       57.4
## 2 2020-02-01 00:10:00                    20.6       84  
## 3 2020-02-01 00:20:00                    17.2       83  
## 4 2020-02-01 00:30:00                    15.4       89  
## 5 2020-02-01 00:40:00                    12.4       78  
## 6 2020-02-01 00:50:00                    10.5       80

Cek duplikasi

cat("\nJumlah duplikat sebelum dihapus:\n")
## 
## Jumlah duplikat sebelum dihapus:
print(sum(duplicated(total_electricity144)))
## [1] 0

7. Contoh Plot Occupancy vs Electricity

ggplot(merged_data, aes(x = time)) +
  geom_line(aes(y = Associated_Client_Count, color = "Occupancy")) +
  geom_line(aes(y = total_rate, color = "Electricity")) +
  labs(title = "Time Series: Occupancy vs Electricity",
       x = "Time", y = "Value", color = "Legend") +
  theme_minimal()

## 8. Scatter Plot

ggplot(merged_data, aes(x = Associated_Client_Count, y = total_rate)) +
  geom_point(alpha = 0.5, color = 'pink') +
  labs(title = "Scatter: Occupancy vs Electricity",
       x = "Mean Client Count", y = "Total Electricity") +
  theme_minimal()

## 9. Daily Profiles (24h) + average line

# Ambil data dari merged_data, lalu siapkan kolom date dan time_of_day
df <- merged_data %>%
  mutate(
    date = as.Date(time),
    time_of_day = hms::as_hms(time),
    mean_client_count = Associated_Client_Count,
    total_electricity = total_rate
  )

Plot Daily Profile Occupancy

ggplot(df, aes(x = time_of_day, y = mean_client_count, group = date)) +
  geom_line(alpha = 0.2, color = "blue") +   # garis tipis per hari
  stat_summary(fun = mean, geom = "line", aes(group = 1),
               color = "red", linewidth = 1.2) +  # rata-rata harian
  labs(
    title = "Daily Profile of Occupancy",
    x = "Time of Day", y = "Mean Client Count"
  ) +
  theme_minimal()

Plot Daily Profile Electricity

ggplot(df, aes(x = time_of_day, y = total_electricity, group = date)) +
  geom_line(alpha = 0.2, color = "blue") +
  stat_summary(fun = mean, geom = "line", aes(group = 1),
               color = "red", linewidth = 1.2) +
  labs(
    title = "Daily Profile of Electricity Consumption",
    x = "Time of Day", y = "Total Electricity Rate (KWh)"
  ) +
  theme_minimal()

ANALISIS

a. Identify peak hours of occupancy

merged_data <- merged_data %>%
  mutate(Time_of_Day_in_Minutes = hour(time) * 60 + minute(time))

avg_occupancy <- merged_data %>%
  group_by(Time_of_Day_in_Minutes) %>%
  summarise(mean_occupancy = mean(Associated_Client_Count, na.rm = TRUE),
            .groups = "drop")

peak <- avg_occupancy %>%
  filter(mean_occupancy == max(mean_occupancy, na.rm = TRUE))
peak_time_minutes <- peak$Time_of_Day_in_Minutes
peak_value <- peak$mean_occupancy
peak_time <- sprintf("%02d:%02d", peak_time_minutes %/% 60, peak_time_minutes %% 60)

cat("Jam puncak tingkat hunian perpustakaan:", peak_time, 
    "dengan rata-rata:", round(peak_value, 2), "klien\n")
## Jam puncak tingkat hunian perpustakaan: 15:30 dengan rata-rata: 378.84 klien

b. Correlation

correlation <- cor(df$mean_client_count, df$total_electricity, use = "complete.obs")
print("4b. Korelasi Occupancy vs Electricity:")
## [1] "4b. Korelasi Occupancy vs Electricity:"
print(correlation)
## [1] 0.8736819

c. Highlight anomaly cases

print("Kasus di mana listrik tinggi tapi occupancy rendah:")
## [1] "Kasus di mana listrik tinggi tapi occupancy rendah:"
# === Regresi Linear Sederhana ===
model <- lm(total_rate ~ Associated_Client_Count, data = merged_data)
# Prediksi energi (rate) berdasarkan model
merged_data$predicted_energy <- predict(model, newdata = merged_data)
# Hitung residual (selisih aktual - prediksi)
merged_data$residual <- merged_data$total_rate - merged_data$predicted_energy
# Hitung threshold (2x standar deviasi residual)
threshold <- 2 * sd(merged_data$residual, na.rm = TRUE)
# Filter mismatch (nilai residual yang "aneh")
mismatch <- merged_data %>%
  filter(abs(residual) > threshold) %>%
  select(Associated_Client_Count, total_rate, predicted_energy, residual)
# Tampilkan hasil
print(head(mismatch, 20))  # lihat 20 baris pertama
## # A tibble: 20 × 4
##    Associated_Client_Count total_rate predicted_energy residual
##                      <dbl>      <dbl>            <dbl>    <dbl>
##  1                   22.8        57.4             97.7    -40.3
##  2                    6.75       46               93.8    -47.8
##  3                    9          55               94.4    -39.4
##  4                    8.75       52               94.3    -42.3
##  5                    3.75       52               93.1    -41.1
##  6                    3.75       40               93.1    -53.1
##  7                    3.75       43               93.1    -50.1
##  8                    3.5        48               93.0    -45.0
##  9                    5.5        48               93.5    -45.5
## 10                    9.5        51               94.5    -43.5
## 11                  148         167              128.      39.0
## 12                   87.2       154              113.      40.7
## 13                   50         146              104.      41.7
## 14                   44.6       142              103.      39.0
## 15                    7.25       51               93.9    -42.9
## 16                   13.4        52               95.4    -43.4
## 17                   13          41               95.3    -54.3
## 18                   13.2        45               95.4    -50.4
## 19                   10.5        44               94.7    -50.7
## 20                    7.38       49               94.0    -45.0
cat("Jumlah mismatch:", nrow(mismatch), "\n")
## Jumlah mismatch: 97

membuat scatter plot mismatch

# Tandai data mismatch
merged_data <- merged_data %>%
  mutate(flag = ifelse(abs(residual) > threshold, "Mismatch", "Normal"))

ggplot(merged_data, aes(x = Associated_Client_Count, y = total_rate)) +
  geom_point(aes(color = flag), alpha = 0.6) +  # titik normal vs mismatch
  geom_smooth(method = "lm", se = FALSE, color = "black", size = 1) + # garis regresi
  scale_color_manual(values = c("Normal" = "skyblue", "Mismatch" = "red")) +
  labs(
    title = "Associated Client Count vs Energy with Mismatch Highlighted",
    x = "Associated Client Count",
    y = "Energy Consumption",
    color = "Legend"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

5. Weekend vs Weekday Comparison

Separated weekdays and weekend

merged_tagged <- merged_data %>%
  mutate(
    date = as.Date(time),  # Ekstrak hanya tanggal
    Hour_Minute = format(time, "%H:%M"),
    dow = wday(date, week_start = 1),
    day_type = if_else(dow <= 5, "Weekday (Mon–Fri)", "Weekend (Sat–Sun)")
  )

Agregasi data harian

daily_summary <- merged_tagged %>%
  group_by(date) %>%  # Kelompokkan data berdasarkan tanggal
  slice(1) %>%       # Ambil baris pertama dari setiap kelompok
  ungroup() %>%      # Hapus pengelompokan
  select(date, day_type, time, Associated_Client_Count, total_rate) %>%
  rename(Timestamp = time, 'Associated Client Count' = Associated_Client_Count, rate = total_rate)

# Pisahkan data harian 
weekdays_data <- daily_summary %>% filter(day_type == "Weekday (Mon–Fri)")
weekends_data <- daily_summary %>% filter(day_type == "Weekend (Sat–Sun)")

# Tampilkan hasil
cat("Data Hari Kerja:\n")
## Data Hari Kerja:
print(head(weekdays_data))
## # A tibble: 6 × 5
##   date       day_type          Timestamp           Associated Client Cou…¹  rate
##   <date>     <chr>             <dttm>                                <dbl> <dbl>
## 1 2020-02-03 Weekday (Mon–Fri) 2020-02-03 00:00:00                    39.5   115
## 2 2020-02-04 Weekday (Mon–Fri) 2020-02-04 00:00:00                    46.8   117
## 3 2020-02-05 Weekday (Mon–Fri) 2020-02-05 00:00:00                    55.5   142
## 4 2020-02-06 Weekday (Mon–Fri) 2020-02-06 00:00:00                    32.8    95
## 5 2020-02-07 Weekday (Mon–Fri) 2020-02-07 00:00:00                    53.1   109
## 6 2020-02-10 Weekday (Mon–Fri) 2020-02-10 00:00:00                    45.8    97
## # ℹ abbreviated name: ¹​`Associated Client Count`
cat("\nData Akhir Pekan:\n")
## 
## Data Akhir Pekan:
print(head(weekends_data))
## # A tibble: 6 × 5
##   date       day_type          Timestamp           Associated Client Cou…¹  rate
##   <date>     <chr>             <dttm>                                <dbl> <dbl>
## 1 2020-02-01 Weekend (Sat–Sun) 2020-02-01 00:00:00                    22.8  57.4
## 2 2020-02-02 Weekend (Sat–Sun) 2020-02-02 00:00:00                    29.4 103  
## 3 2020-02-08 Weekend (Sat–Sun) 2020-02-08 00:00:00                    21.2  99  
## 4 2020-02-09 Weekend (Sat–Sun) 2020-02-09 00:00:00                    41   111  
## 5 2020-02-15 Weekend (Sat–Sun) 2020-02-15 00:00:00                    19.4  87  
## 6 2020-02-16 Weekend (Sat–Sun) 2020-02-16 00:00:00                    44.5 101  
## # ℹ abbreviated name: ¹​`Associated Client Count`

Reproduce the previous visualizations for both categories.

1. Time Series Plot: Occupancy dan Electricity

Occupancy Weekday vs Weekend

ggplot(merged_tagged, aes(x = time, y = Associated_Client_Count)) +
  geom_line(color = "blue", alpha = 0.6) +
  facet_wrap(~day_type, ncol = 1, scales = "free_x") +
  labs(title = "Occupancy: Weekday vs Weekend (Split)",
       x = "Time", y = "Client Count") +
  theme_minimal()

Electricity Weekday vs Weekend

ggplot(merged_tagged, aes(x = time, y = total_rate)) +
  geom_line(color = "red", alpha = 0.6) +
  facet_wrap(~day_type, ncol = 1, scales = "free_x") +
  labs(title = "Electricity: Weekday vs Weekend (Split)",
       x = "Time", y = "Total Electricity") +
  theme_minimal()

2. Scatter Weekday vs Weekend

ggplot(merged_tagged, aes(x = Associated_Client_Count, y = total_rate)) +
  geom_point(alpha = 0.5, color = "blue") +
  facet_wrap(~day_type) +
  labs(title = "Scatter: Occupancy vs Electricity (Split by Weekday/Weekend)",
       x = "Client Count", y = "Total Electricity") +
  theme_minimal()

3. Hitung rata-rata weekday vs weekend

avg_profile <- merged_tagged %>%
  group_by(day_type, Hour_Minute) %>%
  summarise(
    avg_clients = mean(Associated_Client_Count, na.rm = TRUE),
    avg_rate = mean(total_rate, na.rm = TRUE),
    .groups = "drop"
  )

4. Plot Occupancy rata-rata weekday vs weekend

avg_profile <- avg_profile %>%
  mutate(Hour_Minute = hms::as_hms(paste0(Hour_Minute, ":00")))

Plot ulang

ggplot(avg_profile, aes(x = Hour_Minute, y = avg_clients, color = day_type, group = day_type)) +
  geom_line(size = 1) +
  labs(title = "Perbandingan Rata-rata Harian: Occupancy",
       x = "Jam dalam Sehari", y = "Jumlah Klien Terhubung",
       color = "Jenis Hari") +
  scale_x_time(
    breaks = hms::as_hms(c("00:00:00", "06:00:00", "12:00:00", "18:00:00", "23:59:59")),
    labels = c("00:00", "06:00", "12:00", "18:00", "24:00")
  ) +
  theme_minimal()

5. Plot Electricity rata-rata weekday vs weekend

ggplot(avg_profile, aes(x = Hour_Minute, y = avg_rate, color = day_type, group = day_type)) +
  geom_line(size = 1) +
  labs(title = "Perbandingan Rata-rata Harian: Electricity",
       x = "Jam dalam Sehari", y = "Tingkat Listrik (KWh)",
       color = "Jenis Hari") +
  scale_x_time(
    breaks = hms::as_hms(c("00:00:00", "06:00:00", "12:00:00", "18:00:00", "23:59:59")),
    labels = c("00:00", "06:00", "12:00", "18:00", "24:00")
  ) +
  theme_minimal()