library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readr)
library(stringr)
wifi <- read_csv("C:/Users/ASUS/Downloads/wifi.csv")
## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Event Time, Uni, Building, Floor
## dbl (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib1 <- read_csv("C:/Users/ASUS/Downloads/library1.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib2 <- read_csv("C:/Users/ASUS/Downloads/library2.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib3 <- read_csv("C:/Users/ASUS/Downloads/library3.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library_wifi_data <- wifi %>%
filter(str_trim(Building) == "Library") %>%
mutate(time = ymd_hms(time))
print(library_wifi_data)
## # A tibble: 28,652 × 7
## time `Event Time` `Associated Client Count`
## <dttm> <chr> <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 29
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 0
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 21
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 38
## 5 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 0
## 6 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 33
## 7 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 22
## 8 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 39
## 9 2020-02-01 00:12:43 Sat Feb 01 00:12:43 UTC 2020 30
## 10 2020-02-01 00:12:43 Sat Feb 01 00:12:43 UTC 2020 0
## # ℹ 28,642 more rows
## # ℹ 4 more variables: `Authenticated Client Count` <dbl>, Uni <chr>,
## # Building <chr>, Floor <chr>
wifi_resampled <- library_wifi_data %>%
mutate(time = floor_date(time, "10 minutes")) %>%
group_by(time) %>%
summarise(Associated_Client_Count = mean(`Associated Client Count`, na.rm = TRUE),
.groups = "drop")
cat("WiFi Library (10 menit):\n")
## WiFi Library (10 menit):
print(head(wifi_resampled))
## # A tibble: 6 × 2
## time Associated_Client_Count
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 22.8
## 2 2020-02-01 00:10:00 20.6
## 3 2020-02-01 00:20:00 17.2
## 4 2020-02-01 00:30:00 15.4
## 5 2020-02-01 00:40:00 12.4
## 6 2020-02-01 00:50:00 10.5
print("Struktur timestamp dari masing-masing listrik:")
## [1] "Struktur timestamp dari masing-masing listrik:"
print(str(lib1$ts))
## POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL
print(str(lib2$ts))
## POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL
print(str(lib3$ts))
## POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL
lib1 <- lib1 %>%
mutate(ts = parse_date_time(ts, orders = c("mdy HM", "mdy HMS")))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `ts = parse_date_time(ts, orders = c("mdy HM", "mdy HMS"))`.
## Caused by warning:
## ! All formats failed to parse. No formats found.
print(str(lib1$ts))
## POSIXct[1:18864], format: NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA ...
## NULL
electricity_df <- bind_rows(lib1, lib2, lib3)
# Imputasi nilai rate yang NA dengan rata-rata 144 data pertama
electricity_df <- electricity_df %>%
group_by(name) %>%
mutate(rate = ifelse(is.na(rate),
mean(head(rate, 144), na.rm = TRUE),
rate)) %>%
ungroup()
total_electricity144 <- electricity_df %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(total_rate = sum(rate, na.rm = TRUE), .groups = "drop") %>%
rename(time = ts)
cat("Data Listrik per 10 menit:\n")
## Data Listrik per 10 menit:
print(head(total_electricity144))
## # A tibble: 6 × 2
## time total_rate
## <dttm> <dbl>
## 1 2020-01-01 00:00:00 57.4
## 2 2020-01-01 00:10:00 75
## 3 2020-01-01 00:20:00 64
## 4 2020-01-01 00:30:00 83
## 5 2020-01-01 00:40:00 78
## 6 2020-01-01 00:50:00 52
merged_data <- wifi_resampled %>%
inner_join(total_electricity144, by = "time")
cat("Merged Data:\n")
## Merged Data:
print(head(merged_data))
## # A tibble: 6 × 3
## time Associated_Client_Count total_rate
## <dttm> <dbl> <dbl>
## 1 2020-02-01 00:00:00 22.8 57.4
## 2 2020-02-01 00:10:00 20.6 84
## 3 2020-02-01 00:20:00 17.2 83
## 4 2020-02-01 00:30:00 15.4 89
## 5 2020-02-01 00:40:00 12.4 78
## 6 2020-02-01 00:50:00 10.5 80
cat("\nJumlah duplikat sebelum dihapus:\n")
##
## Jumlah duplikat sebelum dihapus:
print(sum(duplicated(total_electricity144)))
## [1] 0
ggplot(merged_data, aes(x = time)) +
geom_line(aes(y = Associated_Client_Count, color = "Occupancy")) +
geom_line(aes(y = total_rate, color = "Electricity")) +
labs(title = "Time Series: Occupancy vs Electricity",
x = "Time", y = "Value", color = "Legend") +
theme_minimal()
## 8. Scatter Plot
ggplot(merged_data, aes(x = Associated_Client_Count, y = total_rate)) +
geom_point(alpha = 0.5, color = 'pink') +
labs(title = "Scatter: Occupancy vs Electricity",
x = "Mean Client Count", y = "Total Electricity") +
theme_minimal()
## 9. Daily Profiles (24h) + average line
# Ambil data dari merged_data, lalu siapkan kolom date dan time_of_day
df <- merged_data %>%
mutate(
date = as.Date(time),
time_of_day = hms::as_hms(time),
mean_client_count = Associated_Client_Count,
total_electricity = total_rate
)
ggplot(df, aes(x = time_of_day, y = mean_client_count, group = date)) +
geom_line(alpha = 0.2, color = "blue") + # garis tipis per hari
stat_summary(fun = mean, geom = "line", aes(group = 1),
color = "red", linewidth = 1.2) + # rata-rata harian
labs(
title = "Daily Profile of Occupancy",
x = "Time of Day", y = "Mean Client Count"
) +
theme_minimal()
ggplot(df, aes(x = time_of_day, y = total_electricity, group = date)) +
geom_line(alpha = 0.2, color = "blue") +
stat_summary(fun = mean, geom = "line", aes(group = 1),
color = "red", linewidth = 1.2) +
labs(
title = "Daily Profile of Electricity Consumption",
x = "Time of Day", y = "Total Electricity Rate (KWh)"
) +
theme_minimal()
merged_data <- merged_data %>%
mutate(Time_of_Day_in_Minutes = hour(time) * 60 + minute(time))
avg_occupancy <- merged_data %>%
group_by(Time_of_Day_in_Minutes) %>%
summarise(mean_occupancy = mean(Associated_Client_Count, na.rm = TRUE),
.groups = "drop")
peak <- avg_occupancy %>%
filter(mean_occupancy == max(mean_occupancy, na.rm = TRUE))
peak_time_minutes <- peak$Time_of_Day_in_Minutes
peak_value <- peak$mean_occupancy
peak_time <- sprintf("%02d:%02d", peak_time_minutes %/% 60, peak_time_minutes %% 60)
cat("Jam puncak tingkat hunian perpustakaan:", peak_time,
"dengan rata-rata:", round(peak_value, 2), "klien\n")
## Jam puncak tingkat hunian perpustakaan: 15:30 dengan rata-rata: 378.84 klien
correlation <- cor(df$mean_client_count, df$total_electricity, use = "complete.obs")
print("4b. Korelasi Occupancy vs Electricity:")
## [1] "4b. Korelasi Occupancy vs Electricity:"
print(correlation)
## [1] 0.8736819
print("Kasus di mana listrik tinggi tapi occupancy rendah:")
## [1] "Kasus di mana listrik tinggi tapi occupancy rendah:"
# === Regresi Linear Sederhana ===
model <- lm(total_rate ~ Associated_Client_Count, data = merged_data)
# Prediksi energi (rate) berdasarkan model
merged_data$predicted_energy <- predict(model, newdata = merged_data)
# Hitung residual (selisih aktual - prediksi)
merged_data$residual <- merged_data$total_rate - merged_data$predicted_energy
# Hitung threshold (2x standar deviasi residual)
threshold <- 2 * sd(merged_data$residual, na.rm = TRUE)
# Filter mismatch (nilai residual yang "aneh")
mismatch <- merged_data %>%
filter(abs(residual) > threshold) %>%
select(Associated_Client_Count, total_rate, predicted_energy, residual)
# Tampilkan hasil
print(head(mismatch, 20)) # lihat 20 baris pertama
## # A tibble: 20 × 4
## Associated_Client_Count total_rate predicted_energy residual
## <dbl> <dbl> <dbl> <dbl>
## 1 22.8 57.4 97.7 -40.3
## 2 6.75 46 93.8 -47.8
## 3 9 55 94.4 -39.4
## 4 8.75 52 94.3 -42.3
## 5 3.75 52 93.1 -41.1
## 6 3.75 40 93.1 -53.1
## 7 3.75 43 93.1 -50.1
## 8 3.5 48 93.0 -45.0
## 9 5.5 48 93.5 -45.5
## 10 9.5 51 94.5 -43.5
## 11 148 167 128. 39.0
## 12 87.2 154 113. 40.7
## 13 50 146 104. 41.7
## 14 44.6 142 103. 39.0
## 15 7.25 51 93.9 -42.9
## 16 13.4 52 95.4 -43.4
## 17 13 41 95.3 -54.3
## 18 13.2 45 95.4 -50.4
## 19 10.5 44 94.7 -50.7
## 20 7.38 49 94.0 -45.0
cat("Jumlah mismatch:", nrow(mismatch), "\n")
## Jumlah mismatch: 97
# Tandai data mismatch
merged_data <- merged_data %>%
mutate(flag = ifelse(abs(residual) > threshold, "Mismatch", "Normal"))
ggplot(merged_data, aes(x = Associated_Client_Count, y = total_rate)) +
geom_point(aes(color = flag), alpha = 0.6) + # titik normal vs mismatch
geom_smooth(method = "lm", se = FALSE, color = "black", size = 1) + # garis regresi
scale_color_manual(values = c("Normal" = "skyblue", "Mismatch" = "red")) +
labs(
title = "Associated Client Count vs Energy with Mismatch Highlighted",
x = "Associated Client Count",
y = "Energy Consumption",
color = "Legend"
) +
theme_minimal() +
theme(legend.position = "top")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
Separated weekdays and weekend
merged_tagged <- merged_data %>%
mutate(
date = as.Date(time), # Ekstrak hanya tanggal
Hour_Minute = format(time, "%H:%M"),
dow = wday(date, week_start = 1),
day_type = if_else(dow <= 5, "Weekday (Mon–Fri)", "Weekend (Sat–Sun)")
)
daily_summary <- merged_tagged %>%
group_by(date) %>% # Kelompokkan data berdasarkan tanggal
slice(1) %>% # Ambil baris pertama dari setiap kelompok
ungroup() %>% # Hapus pengelompokan
select(date, day_type, time, Associated_Client_Count, total_rate) %>%
rename(Timestamp = time, 'Associated Client Count' = Associated_Client_Count, rate = total_rate)
# Pisahkan data harian
weekdays_data <- daily_summary %>% filter(day_type == "Weekday (Mon–Fri)")
weekends_data <- daily_summary %>% filter(day_type == "Weekend (Sat–Sun)")
# Tampilkan hasil
cat("Data Hari Kerja:\n")
## Data Hari Kerja:
print(head(weekdays_data))
## # A tibble: 6 × 5
## date day_type Timestamp Associated Client Cou…¹ rate
## <date> <chr> <dttm> <dbl> <dbl>
## 1 2020-02-03 Weekday (Mon–Fri) 2020-02-03 00:00:00 39.5 115
## 2 2020-02-04 Weekday (Mon–Fri) 2020-02-04 00:00:00 46.8 117
## 3 2020-02-05 Weekday (Mon–Fri) 2020-02-05 00:00:00 55.5 142
## 4 2020-02-06 Weekday (Mon–Fri) 2020-02-06 00:00:00 32.8 95
## 5 2020-02-07 Weekday (Mon–Fri) 2020-02-07 00:00:00 53.1 109
## 6 2020-02-10 Weekday (Mon–Fri) 2020-02-10 00:00:00 45.8 97
## # ℹ abbreviated name: ¹​`Associated Client Count`
cat("\nData Akhir Pekan:\n")
##
## Data Akhir Pekan:
print(head(weekends_data))
## # A tibble: 6 × 5
## date day_type Timestamp Associated Client Cou…¹ rate
## <date> <chr> <dttm> <dbl> <dbl>
## 1 2020-02-01 Weekend (Sat–Sun) 2020-02-01 00:00:00 22.8 57.4
## 2 2020-02-02 Weekend (Sat–Sun) 2020-02-02 00:00:00 29.4 103
## 3 2020-02-08 Weekend (Sat–Sun) 2020-02-08 00:00:00 21.2 99
## 4 2020-02-09 Weekend (Sat–Sun) 2020-02-09 00:00:00 41 111
## 5 2020-02-15 Weekend (Sat–Sun) 2020-02-15 00:00:00 19.4 87
## 6 2020-02-16 Weekend (Sat–Sun) 2020-02-16 00:00:00 44.5 101
## # ℹ abbreviated name: ¹​`Associated Client Count`
Occupancy Weekday vs Weekend
ggplot(merged_tagged, aes(x = time, y = Associated_Client_Count)) +
geom_line(color = "blue", alpha = 0.6) +
facet_wrap(~day_type, ncol = 1, scales = "free_x") +
labs(title = "Occupancy: Weekday vs Weekend (Split)",
x = "Time", y = "Client Count") +
theme_minimal()
ggplot(merged_tagged, aes(x = time, y = total_rate)) +
geom_line(color = "red", alpha = 0.6) +
facet_wrap(~day_type, ncol = 1, scales = "free_x") +
labs(title = "Electricity: Weekday vs Weekend (Split)",
x = "Time", y = "Total Electricity") +
theme_minimal()
ggplot(merged_tagged, aes(x = Associated_Client_Count, y = total_rate)) +
geom_point(alpha = 0.5, color = "blue") +
facet_wrap(~day_type) +
labs(title = "Scatter: Occupancy vs Electricity (Split by Weekday/Weekend)",
x = "Client Count", y = "Total Electricity") +
theme_minimal()
avg_profile <- merged_tagged %>%
group_by(day_type, Hour_Minute) %>%
summarise(
avg_clients = mean(Associated_Client_Count, na.rm = TRUE),
avg_rate = mean(total_rate, na.rm = TRUE),
.groups = "drop"
)
avg_profile <- avg_profile %>%
mutate(Hour_Minute = hms::as_hms(paste0(Hour_Minute, ":00")))
ggplot(avg_profile, aes(x = Hour_Minute, y = avg_clients, color = day_type, group = day_type)) +
geom_line(size = 1) +
labs(title = "Perbandingan Rata-rata Harian: Occupancy",
x = "Jam dalam Sehari", y = "Jumlah Klien Terhubung",
color = "Jenis Hari") +
scale_x_time(
breaks = hms::as_hms(c("00:00:00", "06:00:00", "12:00:00", "18:00:00", "23:59:59")),
labels = c("00:00", "06:00", "12:00", "18:00", "24:00")
) +
theme_minimal()
ggplot(avg_profile, aes(x = Hour_Minute, y = avg_rate, color = day_type, group = day_type)) +
geom_line(size = 1) +
labs(title = "Perbandingan Rata-rata Harian: Electricity",
x = "Jam dalam Sehari", y = "Tingkat Listrik (KWh)",
color = "Jenis Hari") +
scale_x_time(
breaks = hms::as_hms(c("00:00:00", "06:00:00", "12:00:00", "18:00:00", "23:59:59")),
labels = c("00:00", "06:00", "12:00", "18:00", "24:00")
) +
theme_minimal()