library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'tibble' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(readr)
library(stringr)

wifi <- read_csv("C:/Users/ASUS/Downloads/wifi.csv")

## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib1 <- read_csv("C:/Users/ASUS/Downloads/library1.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib2 <- read_csv("C:/Users/ASUS/Downloads/library2.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib3 <- read_csv("C:/Users/ASUS/Downloads/library3.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2. Filter Library WiFi

library_wifi_data <- wifi %>%
  filter(str_trim(Building) == "Library") %>% 
  mutate(time = ymd_hms(time))
print(library_wifi_data)

## # A tibble: 28,652 × 7
##    time                `Event Time`                 `Associated Client Count`
##    <dttm>              <chr>                                            <dbl>
##  1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                        29
##  2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                         0
##  3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                        21
##  4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                        38
##  5 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                         0
##  6 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                        33
##  7 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                        22
##  8 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                        39
##  9 2020-02-01 00:12:43 Sat Feb 01 00:12:43 UTC 2020                        30
## 10 2020-02-01 00:12:43 Sat Feb 01 00:12:43 UTC 2020                         0
## # ℹ 28,642 more rows
## # ℹ 4 more variables: `Authenticated Client Count` <dbl>, Uni <chr>,
## #   Building <chr>, Floor <chr>

wifi_resampled <- library_wifi_data %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(Associated_Client_Count = mean(`Associated Client Count`, na.rm = TRUE),
            .groups = "drop")

cat("WiFi Library (10 menit):\n")

## WiFi Library (10 menit):

print(head(wifi_resampled))

## # A tibble: 6 × 2
##   time                Associated_Client_Count
##   <dttm>                                <dbl>
## 1 2020-02-01 00:00:00                    22.8
## 2 2020-02-01 00:10:00                    20.6
## 3 2020-02-01 00:20:00                    17.2
## 4 2020-02-01 00:30:00                    15.4
## 5 2020-02-01 00:40:00                    12.4
## 6 2020-02-01 00:50:00                    10.5

3. Standarkan kolom ts di lib1/lib2/lib3

print("Struktur timestamp dari masing-masing listrik:")

## [1] "Struktur timestamp dari masing-masing listrik:"

print(str(lib1$ts))

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

print(str(lib2$ts))

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

print(str(lib3$ts))

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

Data Cleaning

lib1 <- lib1 %>%
  mutate(ts = parse_date_time(ts, orders = c("mdy HM", "mdy HMS")))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `ts = parse_date_time(ts, orders = c("mdy HM", "mdy HMS"))`.
## Caused by warning:
## ! All formats failed to parse. No formats found.

print(str(lib1$ts))

##  POSIXct[1:18864], format: NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA ...
## NULL

4. Gabungkan data listrik jadi satu

electricity_df <- bind_rows(lib1, lib2, lib3)

# Imputasi nilai rate yang NA dengan rata-rata 144 data pertama
electricity_df <- electricity_df %>%
  group_by(name) %>%
  mutate(rate = ifelse(is.na(rate),
                       mean(head(rate, 144), na.rm = TRUE),
                       rate)) %>%
  ungroup()

5. Agregasi listrik per 10 menit

total_electricity144 <- electricity_df %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(total_rate = sum(rate, na.rm = TRUE), .groups = "drop") %>%
  rename(time = ts)

cat("Data Listrik per 10 menit:\n")

## Data Listrik per 10 menit:

print(head(total_electricity144))

## # A tibble: 6 × 2
##   time                total_rate
##   <dttm>                   <dbl>
## 1 2020-01-01 00:00:00       57.4
## 2 2020-01-01 00:10:00       75  
## 3 2020-01-01 00:20:00       64  
## 4 2020-01-01 00:30:00       83  
## 5 2020-01-01 00:40:00       78  
## 6 2020-01-01 00:50:00       52

6.Gabungkan WiFi & Listrik

merged_data <- wifi_resampled %>%
  inner_join(total_electricity144, by = "time")

cat("Merged Data:\n")

## Merged Data:

print(head(merged_data))

## # A tibble: 6 × 3
##   time                Associated_Client_Count total_rate
##   <dttm>                                <dbl>      <dbl>
## 1 2020-02-01 00:00:00                    22.8       57.4
## 2 2020-02-01 00:10:00                    20.6       84  
## 3 2020-02-01 00:20:00                    17.2       83  
## 4 2020-02-01 00:30:00                    15.4       89  
## 5 2020-02-01 00:40:00                    12.4       78  
## 6 2020-02-01 00:50:00                    10.5       80

Cek duplikasi

cat("\nJumlah duplikat sebelum dihapus:\n")

## 
## Jumlah duplikat sebelum dihapus:

print(sum(duplicated(total_electricity144)))

## [1] 0

7. Contoh Plot Occupancy vs Electricity

ggplot(merged_data, aes(x = time)) +
  geom_line(aes(y = Associated_Client_Count, color = "Occupancy")) +
  geom_line(aes(y = total_rate, color = "Electricity")) +
  labs(title = "Time Series: Occupancy vs Electricity",
       x = "Time", y = "Value", color = "Legend") +
  theme_minimal()

## 8. Scatter Plot

ggplot(merged_data, aes(x = Associated_Client_Count, y = total_rate)) +
  geom_point(alpha = 0.5, color = 'pink') +
  labs(title = "Scatter: Occupancy vs Electricity",
       x = "Mean Client Count", y = "Total Electricity") +
  theme_minimal()

## 9. Daily Profiles (24h) + average line

# Ambil data dari merged_data, lalu siapkan kolom date dan time_of_day
df <- merged_data %>%
  mutate(
    date = as.Date(time),
    time_of_day = hms::as_hms(time),
    mean_client_count = Associated_Client_Count,
    total_electricity = total_rate
  )

Plot Daily Profile Occupancy

ggplot(df, aes(x = time_of_day, y = mean_client_count, group = date)) +
  geom_line(alpha = 0.2, color = "blue") +   # garis tipis per hari
  stat_summary(fun = mean, geom = "line", aes(group = 1),
               color = "red", linewidth = 1.2) +  # rata-rata harian
  labs(
    title = "Daily Profile of Occupancy",
    x = "Time of Day", y = "Mean Client Count"
  ) +
  theme_minimal()

Plot Daily Profile Electricity

ggplot(df, aes(x = time_of_day, y = total_electricity, group = date)) +
  geom_line(alpha = 0.2, color = "blue") +
  stat_summary(fun = mean, geom = "line", aes(group = 1),
               color = "red", linewidth = 1.2) +
  labs(
    title = "Daily Profile of Electricity Consumption",
    x = "Time of Day", y = "Total Electricity Rate (KWh)"
  ) +
  theme_minimal()

ANALISIS

a. Identify peak hours of occupancy

merged_data <- merged_data %>%
  mutate(Time_of_Day_in_Minutes = hour(time) * 60 + minute(time))

avg_occupancy <- merged_data %>%
  group_by(Time_of_Day_in_Minutes) %>%
  summarise(mean_occupancy = mean(Associated_Client_Count, na.rm = TRUE),
            .groups = "drop")

peak <- avg_occupancy %>%
  filter(mean_occupancy == max(mean_occupancy, na.rm = TRUE))
peak_time_minutes <- peak$Time_of_Day_in_Minutes
peak_value <- peak$mean_occupancy
peak_time <- sprintf("%02d:%02d", peak_time_minutes %/% 60, peak_time_minutes %% 60)

cat("Jam puncak tingkat hunian perpustakaan:", peak_time, 
    "dengan rata-rata:", round(peak_value, 2), "klien\n")

## Jam puncak tingkat hunian perpustakaan: 15:30 dengan rata-rata: 378.84 klien

b. Correlation

correlation <- cor(df$mean_client_count, df$total_electricity, use = "complete.obs")
print("4b. Korelasi Occupancy vs Electricity:")

## [1] "4b. Korelasi Occupancy vs Electricity:"

print(correlation)

## [1] 0.8736819

c. Highlight anomaly cases

print("Kasus di mana listrik tinggi tapi occupancy rendah:")

## [1] "Kasus di mana listrik tinggi tapi occupancy rendah:"

# === Regresi Linear Sederhana ===
model <- lm(total_rate ~ Associated_Client_Count, data = merged_data)
# Prediksi energi (rate) berdasarkan model
merged_data$predicted_energy <- predict(model, newdata = merged_data)
# Hitung residual (selisih aktual - prediksi)
merged_data$residual <- merged_data$total_rate - merged_data$predicted_energy
# Hitung threshold (2x standar deviasi residual)
threshold <- 2 * sd(merged_data$residual, na.rm = TRUE)
# Filter mismatch (nilai residual yang "aneh")
mismatch <- merged_data %>%
  filter(abs(residual) > threshold) %>%
  select(Associated_Client_Count, total_rate, predicted_energy, residual)
# Tampilkan hasil
print(head(mismatch, 20))  # lihat 20 baris pertama

## # A tibble: 20 × 4
##    Associated_Client_Count total_rate predicted_energy residual
##                      <dbl>      <dbl>            <dbl>    <dbl>
##  1                   22.8        57.4             97.7    -40.3
##  2                    6.75       46               93.8    -47.8
##  3                    9          55               94.4    -39.4
##  4                    8.75       52               94.3    -42.3
##  5                    3.75       52               93.1    -41.1
##  6                    3.75       40               93.1    -53.1
##  7                    3.75       43               93.1    -50.1
##  8                    3.5        48               93.0    -45.0
##  9                    5.5        48               93.5    -45.5
## 10                    9.5        51               94.5    -43.5
## 11                  148         167              128.      39.0
## 12                   87.2       154              113.      40.7
## 13                   50         146              104.      41.7
## 14                   44.6       142              103.      39.0
## 15                    7.25       51               93.9    -42.9
## 16                   13.4        52               95.4    -43.4
## 17                   13          41               95.3    -54.3
## 18                   13.2        45               95.4    -50.4
## 19                   10.5        44               94.7    -50.7
## 20                    7.38       49               94.0    -45.0

cat("Jumlah mismatch:", nrow(mismatch), "\n")

## Jumlah mismatch: 97

membuat scatter plot mismatch

# Tandai data mismatch
merged_data <- merged_data %>%
  mutate(flag = ifelse(abs(residual) > threshold, "Mismatch", "Normal"))

ggplot(merged_data, aes(x = Associated_Client_Count, y = total_rate)) +
  geom_point(aes(color = flag), alpha = 0.6) +  # titik normal vs mismatch
  geom_smooth(method = "lm", se = FALSE, color = "black", size = 1) + # garis regresi
  scale_color_manual(values = c("Normal" = "skyblue", "Mismatch" = "red")) +
  labs(
    title = "Associated Client Count vs Energy with Mismatch Highlighted",
    x = "Associated Client Count",
    y = "Energy Consumption",
    color = "Legend"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

5. Weekend vs Weekday Comparison

Separated weekdays and weekend

merged_tagged <- merged_data %>%
  mutate(
    date = as.Date(time),  # Ekstrak hanya tanggal
    Hour_Minute = format(time, "%H:%M"),
    dow = wday(date, week_start = 1),
    day_type = if_else(dow <= 5, "Weekday (Mon–Fri)", "Weekend (Sat–Sun)")
  )

Agregasi data harian

daily_summary <- merged_tagged %>%
  group_by(date) %>%  # Kelompokkan data berdasarkan tanggal
  slice(1) %>%       # Ambil baris pertama dari setiap kelompok
  ungroup() %>%      # Hapus pengelompokan
  select(date, day_type, time, Associated_Client_Count, total_rate) %>%
  rename(Timestamp = time, 'Associated Client Count' = Associated_Client_Count, rate = total_rate)

# Pisahkan data harian 
weekdays_data <- daily_summary %>% filter(day_type == "Weekday (Mon–Fri)")
weekends_data <- daily_summary %>% filter(day_type == "Weekend (Sat–Sun)")

# Tampilkan hasil
cat("Data Hari Kerja:\n")

## Data Hari Kerja:

print(head(weekdays_data))

## # A tibble: 6 × 5
##   date       day_type          Timestamp           Associated Client Cou…¹  rate
##   <date>     <chr>             <dttm>                                <dbl> <dbl>
## 1 2020-02-03 Weekday (Mon–Fri) 2020-02-03 00:00:00                    39.5   115
## 2 2020-02-04 Weekday (Mon–Fri) 2020-02-04 00:00:00                    46.8   117
## 3 2020-02-05 Weekday (Mon–Fri) 2020-02-05 00:00:00                    55.5   142
## 4 2020-02-06 Weekday (Mon–Fri) 2020-02-06 00:00:00                    32.8    95
## 5 2020-02-07 Weekday (Mon–Fri) 2020-02-07 00:00:00                    53.1   109
## 6 2020-02-10 Weekday (Mon–Fri) 2020-02-10 00:00:00                    45.8    97
## # ℹ abbreviated name: ¹`Associated Client Count`

cat("\nData Akhir Pekan:\n")

## 
## Data Akhir Pekan:

print(head(weekends_data))

## # A tibble: 6 × 5
##   date       day_type          Timestamp           Associated Client Cou…¹  rate
##   <date>     <chr>             <dttm>                                <dbl> <dbl>
## 1 2020-02-01 Weekend (Sat–Sun) 2020-02-01 00:00:00                    22.8  57.4
## 2 2020-02-02 Weekend (Sat–Sun) 2020-02-02 00:00:00                    29.4 103  
## 3 2020-02-08 Weekend (Sat–Sun) 2020-02-08 00:00:00                    21.2  99  
## 4 2020-02-09 Weekend (Sat–Sun) 2020-02-09 00:00:00                    41   111  
## 5 2020-02-15 Weekend (Sat–Sun) 2020-02-15 00:00:00                    19.4  87  
## 6 2020-02-16 Weekend (Sat–Sun) 2020-02-16 00:00:00                    44.5 101  
## # ℹ abbreviated name: ¹`Associated Client Count`

Reproduce the previous visualizations for both categories.

1. Time Series Plot: Occupancy dan Electricity

Occupancy Weekday vs Weekend

ggplot(merged_tagged, aes(x = time, y = Associated_Client_Count)) +
  geom_line(color = "blue", alpha = 0.6) +
  facet_wrap(~day_type, ncol = 1, scales = "free_x") +
  labs(title = "Occupancy: Weekday vs Weekend (Split)",
       x = "Time", y = "Client Count") +
  theme_minimal()

Electricity Weekday vs Weekend

ggplot(merged_tagged, aes(x = time, y = total_rate)) +
  geom_line(color = "red", alpha = 0.6) +
  facet_wrap(~day_type, ncol = 1, scales = "free_x") +
  labs(title = "Electricity: Weekday vs Weekend (Split)",
       x = "Time", y = "Total Electricity") +
  theme_minimal()

2. Scatter Weekday vs Weekend

ggplot(merged_tagged, aes(x = Associated_Client_Count, y = total_rate)) +
  geom_point(alpha = 0.5, color = "blue") +
  facet_wrap(~day_type) +
  labs(title = "Scatter: Occupancy vs Electricity (Split by Weekday/Weekend)",
       x = "Client Count", y = "Total Electricity") +
  theme_minimal()

3. Hitung rata-rata weekday vs weekend

avg_profile <- merged_tagged %>%
  group_by(day_type, Hour_Minute) %>%
  summarise(
    avg_clients = mean(Associated_Client_Count, na.rm = TRUE),
    avg_rate = mean(total_rate, na.rm = TRUE),
    .groups = "drop"
  )

4. Plot Occupancy rata-rata weekday vs weekend

avg_profile <- avg_profile %>%
  mutate(Hour_Minute = hms::as_hms(paste0(Hour_Minute, ":00")))

Plot ulang

ggplot(avg_profile, aes(x = Hour_Minute, y = avg_clients, color = day_type, group = day_type)) +
  geom_line(size = 1) +
  labs(title = "Perbandingan Rata-rata Harian: Occupancy",
       x = "Jam dalam Sehari", y = "Jumlah Klien Terhubung",
       color = "Jenis Hari") +
  scale_x_time(
    breaks = hms::as_hms(c("00:00:00", "06:00:00", "12:00:00", "18:00:00", "23:59:59")),
    labels = c("00:00", "06:00", "12:00", "18:00", "24:00")
  ) +
  theme_minimal()

5. Plot Electricity rata-rata weekday vs weekend

ggplot(avg_profile, aes(x = Hour_Minute, y = avg_rate, color = day_type, group = day_type)) +
  geom_line(size = 1) +
  labs(title = "Perbandingan Rata-rata Harian: Electricity",
       x = "Jam dalam Sehari", y = "Tingkat Listrik (KWh)",
       color = "Jenis Hari") +
  scale_x_time(
    breaks = hms::as_hms(c("00:00:00", "06:00:00", "12:00:00", "18:00:00", "23:59:59")),
    labels = c("00:00", "06:00", "12:00", "18:00", "24:00")
  ) +
  theme_minimal()

Kel12_Data Mining & Visualization

2025-09-14

2. Filter Library WiFi

3. Standarkan kolom ts di lib1/lib2/lib3

Data Cleaning

4. Gabungkan data listrik jadi satu

5. Agregasi listrik per 10 menit

6.Gabungkan WiFi & Listrik

Cek duplikasi

7. Contoh Plot Occupancy vs Electricity

Plot Daily Profile Occupancy

Plot Daily Profile Electricity

ANALISIS

a. Identify peak hours of occupancy

b. Correlation

c. Highlight anomaly cases

membuat scatter plot mismatch

5. Weekend vs Weekday Comparison

Agregasi data harian

Reproduce the previous visualizations for both categories.

1. Time Series Plot: Occupancy dan Electricity

Electricity Weekday vs Weekend

2. Scatter Weekday vs Weekend

3. Hitung rata-rata weekday vs weekend

4. Plot Occupancy rata-rata weekday vs weekend

Plot ulang

5. Plot Electricity rata-rata weekday vs weekend