Nama anggota:

Ferdian Dhanino Rachmawan (5003231102)
Tabitha Madeline De Yong (5003231110)
Khoirun Nisa (5003231111)
Michael Tannelson (5003231120)

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(tidyr)
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

# membaca data csv
wifi<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//wifi.csv")

## Rows: 1883844 Columns: 7

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

wifi

# Menampilkan data lengkap library1
lib1<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//library1.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib1

# Menampilkan data lengkap library2
lib2<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//library2.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib2

# Menampilkan data lengkap library3
lib3<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//library3.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib3

# Memfilter mengambil data wifi dengan ketentuan place hanya "Library"
wifi_clean<-wifi %>%
  filter(Building == "Library")
wifi_clean

# Meresample data wifi clean ke interval 10 menit
wifi_resampled <- wifi_clean %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(mean_client = mean(`Associated Client Count`, na.rm = TRUE),
            .groups = "drop")
wifi_resampled

energy_raw <- bind_rows(lib1, lib2, lib3)
energy_raw

DATA CLEANING

# Mengubah tipe waktu dan Menghapus duplikat pada dataset library (jika ada)
lib1 <- lib1 %>%
  mutate(ts = as_datetime(ts, tz="UTC")) %>%
  distinct()
lib2 <- lib2 %>%
  mutate(ts = as_datetime(ts, tz="UTC")) %>%
  distinct()
lib3 <- lib3 %>%
  mutate(ts = as_datetime(ts, tz="UTC")) %>%
  distinct()

print(lib1)

## # A tibble: 18,864 × 6
##    ts                  name              reading units cumulative  rate
##    <dttm>              <chr>               <dbl> <chr>      <dbl> <dbl>
##  1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh      1489442    NA
##  2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh      1489449     7
##  3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh      1489456     7
##  4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh      1489464     8
##  5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh      1489471     7
##  6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh      1489479     8
##  7 2020-01-01 01:00:00 MC065-L01/M9R2048 1489486 KWh      1489486     7
##  8 2020-01-01 01:10:00 MC065-L01/M9R2048 1489494 KWh      1489494     8
##  9 2020-01-01 01:20:00 MC065-L01/M9R2048 1489501 KWh      1489501     7
## 10 2020-01-01 01:30:00 MC065-L01/M9R2048 1489509 KWh      1489509     8
## # ℹ 18,854 more rows

print(lib2)

## # A tibble: 18,864 × 6
##    ts                  name               reading units cumulative  rate
##    <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
##  1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh      2129016    NA
##  2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh      2129034    18
##  3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh      2129054    20
##  4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh      2129071    17
##  5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh      2129086    15
##  6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh      2129103    17
##  7 2020-01-01 01:00:00 MC065-L01/M11R2056 2129120 KWh      2129120    17
##  8 2020-01-01 01:10:00 MC065-L01/M11R2056 2129137 KWh      2129137    17
##  9 2020-01-01 01:20:00 MC065-L01/M11R2056 2129154 KWh      2129154    17
## 10 2020-01-01 01:30:00 MC065-L01/M11R2056 2129171 KWh      2129171    17
## # ℹ 18,854 more rows

print(lib3)

## # A tibble: 18,864 × 6
##    ts                  name               reading units cumulative  rate
##    <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
##  1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh      6914209    NA
##  2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh      6914266    57
##  3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh      6914310    44
##  4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh      6914376    66
##  5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh      6914439    63
##  6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh      6914474    35
##  7 2020-01-01 01:00:00 MC065-L01/M13R2064 6914521 KWh      6914521    47
##  8 2020-01-01 01:10:00 MC065-L01/M13R2064 6914576 KWh      6914576    55
##  9 2020-01-01 01:20:00 MC065-L01/M13R2064 6914610 KWh      6914610    34
## 10 2020-01-01 01:30:00 MC065-L01/M13R2064 6914651 KWh      6914651    41
## # ℹ 18,854 more rows

# Fungsi untuk mengisi nilai MV dengan mean dari 144 obs pertama/teratas
fill_missing_avg <- function(df){
  baseline <- mean(df$rate[1:144], na.rm = TRUE)
  df %>%
    mutate(rate = ifelse(is.na(rate), baseline, rate)
    )
}

# cek missing values sebelum cleaning
cat("Missing Values BEFORE cleaning:\n")

## Missing Values BEFORE cleaning:

sapply(
  list(lib1=lib1, lib2=lib2, lib3=lib3),
  function(df) sum(is.na(df$rate))
)

## lib1 lib2 lib3 
## 3047 3047 3047

# apply ke masing-masing dataset
library1_clean <- fill_missing_avg(lib1)
library2_clean <- fill_missing_avg(lib2)
library3_clean <- fill_missing_avg(lib3)

# cek missing values setelah cleaning
cat("Missing values AFTER cleaning:\n")

## Missing values AFTER cleaning:

sapply(
  list(library1_clean=library1_clean,
       library2_clean=library2_clean,
       library3_clean=library3_clean),
  function(df) sum(is.na(df$rate))
)

## library1_clean library2_clean library3_clean 
##              0              0              0

# Menggabung library yang sudah clean
energy_clean <- bind_rows(library1_clean, library2_clean, library3_clean)

# Pastikan ts datetime
energy_clean <- energy_clean %>%
  mutate(ts = as_datetime(ts, tz="UTC"))

# Memastikan data dimulai dari waktu 00:00:00
ts_start <- as.POSIXct(paste0(as.Date(min(energy_clean$ts)), " 00:00:00"), tz="UTC")
ts_end   <- max(energy_clean$ts, na.rm=TRUE)

# Resampling 10 menit
energy_resampled <- energy_clean %>%
  mutate(ts_10min = floor_date(ts, "10 minutes")) %>%
  group_by(ts_10min) %>%
  summarise(rate = sum(rate, na.rm=TRUE), .groups="drop") %>%
  complete(ts_10min = seq(ts_start, ts_end, by="10 min"),
           fill = list(rate = 0)) %>%
  arrange(ts_10min)

# Cek hasilnya
cat("Resampled electricity data shape:\n")

## Resampled electricity data shape:

print(dim(energy_resampled))

## [1] 18864     2

head(energy_resampled)

Visualization of Occupancy

# Merge data occupancy dan energy
occupancy_energy <- wifi_resampled %>%
  rename(occupancy = mean_client) %>%
  inner_join(energy_resampled %>% rename (energy = rate),
             by = c("time"="ts_10min"))

head(occupancy_energy)

# time series plot occupancy dan energi
ggplot(occupancy_energy, aes(x=time))+
  geom_line(aes(y = occupancy, color= "Occupancy"))+
  geom_line(aes(y = energy, color= "Energy"))+
  scale_y_continuous(
    name = "Occupancy",
    sec.axis = sec_axis(~., name = "Energy")
  )+
  labs(title = "Time Series: Occupancy & Energy Consumption", x = "Time")+
  scale_color_manual(values = c("Occupancy" = "blue", "Energy" = "orange"))+ theme_minimal()

#Scatter Plot
ggplot(occupancy_energy, aes(x = occupancy, y= energy))+
  geom_point(alpha=0.4, color = "blue")+
  geom_smooth(method="lm", color="salmon")+
  labs(title= "Scatter Plot : Occupancy vs Energy",
       x = "Occupancy (clients)",
       y = "Energy Consumption (rate)")+
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Daily Profiles (24H)
occupancy_energy <- occupancy_energy %>%
  mutate(hour = hour(time),
         date = as.Date(time))

# Daily Profiles: Satu garis per hari sesuai perintah
ggplot(occupancy_energy, aes(x = hour, y = occupancy, group = date))+
  geom_line(alpha = 0.3, color = "darkgreen")+
  stat_summary(aes(group=1), fun = mean, geom = "line", color = "purple", size = 1.2)+
  labs(title = "Daily Occupancy Profiles with Average (rata)",
       x = "Hour of Day",
       y = "Occupancy") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Mengidentifikasi Peak Hours
occupancy_energy %>%
  group_by(hour) %>%
  summarise(avg_occupancy = mean(occupancy),
            avg_energy = mean(energy)) %>%
  arrange(desc(avg_occupancy))

# Mengetahui apakah occupancy secara signifikan berpengaruh terhadap konsumsi listrik (energi)
cor(occupancy_energy$occupancy, occupancy_energy$energy)

## [1] 0.8782117

# Highlight 10 kasus energy tidak sesuai occupancy
top_residuals <- occupancy_energy %>%
  # hitung residual langsung dari model linear sementara
  mutate(residual = energy - predict(lm(energy ~ occupancy, data = .))) %>%
  arrange(desc(abs(residual))) %>%
  slice_head(n = 10)  # 10 kasus ekstrem di data

top_residuals

# Weekends Vs Weekday Comparison
# Memisahkan dataset weekdays (senin-jumat) dan weekend (sabtu-minggu)
occupancy_energy <- occupancy_energy %>%
  mutate(day_type = ifelse(wday(date) %in% c(1,7), "Weekend", "Weekday"))
occupancy_energy

# Occupancy Time Series antara weekday dan weekend
ggplot(occupancy_energy, aes(x = time, y = occupancy, color = day_type)) +
  geom_line(alpha = 0.6) +
  labs(title = "Occupancy Over Time: Weekday vs Weekend",
       x = "Time", y = "Occupancy") +
  theme_minimal()

# Energy Time Series antara weekday dengan weekend
ggplot(occupancy_energy, aes(x = time, y = energy, color = day_type)) +
  geom_line(alpha = 0.6) +
  labs(title = "Energy Consumption Over Time: Weekday vs Weekend",
       x = "Time", y = "Energy") +
  theme_minimal()

#Scatter plot (occupancy vs energy)
ggplot(occupancy_energy, aes(x = occupancy, y = energy, color = day_type)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, size = 1) +
  labs(title = "Occupancy vs Energy: Weekday vs Weekend",
       x = "Occupancy", y = "Energy") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Daily profile per day (24h)
ggplot(occupancy_energy, aes(x = hour, y = occupancy, group = date, color = day_type)) +
  geom_line(alpha = 0.3) +
  stat_summary(aes(group=day_type), fun = mean, geom = "line", size = 1.2) +
  labs(title = "Daily Occupancy Profiles: Weekday vs Weekend",
       x = "Hour of Day",
       y = "Occupancy") +
  theme_minimal()

Data Mining AdventureR

Visualization of Occupancy