Coding Library Energy Analysis

Data Mining Kelompok 3 Anggota Kelompok. Ara Dina Mumtaza (5003231104). Dea Kharisma Nuril Ba’is (5003231109). Jasmine Sabrina Rahmaira Saputra (5003231114). Artera Dwi Septianingtyas (5003231118)

TAHAPAN

Import library yang diperlukan dan load data

# 1. Import Library & Data
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.3.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(readr)

## Warning: package 'readr' was built under R version 4.3.3

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'stringr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(gridExtra) # For arranging plots

## Warning: package 'gridExtra' was built under R version 4.3.3

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(forecast)  # For time series models

## Warning: package 'forecast' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(scales)    # For formatting axis labels

## Warning: package 'scales' was built under R version 4.3.3

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(ggplot2)

Load Data

# 2. Input Data
wifi     <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/wifi.csv")

## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library1 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library1.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library2 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library2.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library3 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library3.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Mengecek data yang terbaca

head(wifi)

## # A tibble: 6 × 7
##   time                `Event Time` Associated Client Co…¹ Authenticated Client…²
##   <dttm>              <chr>                         <dbl>                  <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 …                    184                    182
## 2 2020-02-01 00:02:12 Sat Feb 01 …                      6                      6
## 3 2020-02-01 00:02:12 Sat Feb 01 …                     18                     18
## 4 2020-02-01 00:02:12 Sat Feb 01 …                     23                     23
## 5 2020-02-01 00:02:12 Sat Feb 01 …                     45                     45
## 6 2020-02-01 00:02:12 Sat Feb 01 …                     16                     16
## # ℹ abbreviated names: ¹`Associated Client Count`,
## #   ²`Authenticated Client Count`
## # ℹ 3 more variables: Uni <chr>, Building <chr>, Floor <chr>

head(library1)

## # A tibble: 6 × 6
##   ts                  name              reading units cumulative  rate
##   <dttm>              <chr>               <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh      1489442    NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh      1489449     7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh      1489456     7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh      1489464     8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh      1489471     7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh      1489479     8

head(library2)

## # A tibble: 6 × 6
##   ts                  name               reading units cumulative  rate
##   <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh      2129016    NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh      2129034    18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh      2129054    20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh      2129071    17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh      2129086    15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh      2129103    17

head(library3)

## # A tibble: 6 × 6
##   ts                  name               reading units cumulative  rate
##   <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh      6914209    NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh      6914266    57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh      6914310    44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh      6914376    66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh      6914439    63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh      6914474    35

Filter & Synchronize the Data

Use only Library building WiFi records.
Ensure all data is standardized at 10-minute intervals:
- Resample WiFi data (mean of Associated Client Count).
- Aggregate library meters into total electricity rate per 10 minutes.

Filter hanya Library pada wifi

wifi_library <- wifi %>%
  filter(grepl("Library", Building, ignore.case = TRUE)) %>%
  mutate(Timestamp = ymd_hms(time))

head(wifi_library)

## # A tibble: 6 × 8
##   time                `Event Time` Associated Client Co…¹ Authenticated Client…²
##   <dttm>              <chr>                         <dbl>                  <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 …                      1                      0
## 2 2020-02-01 00:02:12 Sat Feb 01 …                     29                     28
## 3 2020-02-01 00:02:12 Sat Feb 01 …                      0                      0
## 4 2020-02-01 00:02:12 Sat Feb 01 …                      0                      0
## 5 2020-02-01 00:02:12 Sat Feb 01 …                      0                      0
## 6 2020-02-01 00:02:12 Sat Feb 01 …                     21                     20
## # ℹ abbreviated names: ¹`Associated Client Count`,
## #   ²`Authenticated Client Count`
## # ℹ 4 more variables: Uni <chr>, Building <chr>, Floor <chr>, Timestamp <dttm>

Resample WiFi ke interval 10 menit (jawaban telah sesuai dengan komputasi python)

wifi_10 <- wifi_library %>%
  mutate(Timestamp = floor_date(Timestamp, "10 minutes")) %>%
  group_by(Timestamp) %>%
  summarise(occupancy = mean(`Associated Client Count`, na.rm = TRUE)) %>%
  ungroup()

print(wifi_10)

## # A tibble: 3,683 × 2
##    Timestamp           occupancy
##    <dttm>                  <dbl>
##  1 2020-02-01 00:00:00     13.1 
##  2 2020-02-01 00:10:00     11.8 
##  3 2020-02-01 00:20:00     10   
##  4 2020-02-01 00:30:00      8.79
##  5 2020-02-01 00:40:00      7.07
##  6 2020-02-01 00:50:00      6   
##  7 2020-02-01 01:00:00      5.29
##  8 2020-02-01 01:10:00      4.95
##  9 2020-02-01 01:20:00      4.86
## 10 2020-02-01 01:30:00      4.79
## # ℹ 3,673 more rows

Resample masing-masing file library ke 10 menit dan menggabungkan data library serta menghitung total atau jumlah energi dalam satu dataframe “energy”

library1_10 <- library1 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(library1 = mean(rate, na.rm = TRUE), .groups = "drop")

library2_10 <- library2 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(library2 = mean(rate, na.rm = TRUE), .groups = "drop")

library3_10 <- library3 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(library3 = mean(rate, na.rm = TRUE), .groups = "drop")

# Gabungkan jadi satu data frame
energy <- library1_10 %>%
  full_join(library2_10, by = "ts") %>%
  full_join(library3_10, by = "ts") %>%
  mutate(total_energy = library1 + library2 + library3)

print(energy)

## # A tibble: 18,864 × 5
##    ts                  library1 library2 library3 total_energy
##    <dttm>                 <dbl>    <dbl>    <dbl>        <dbl>
##  1 2020-01-01 00:00:00      NaN      NaN      NaN          NaN
##  2 2020-01-01 00:10:00        7       18       57           82
##  3 2020-01-01 00:20:00        7       20       44           71
##  4 2020-01-01 00:30:00        8       17       66           91
##  5 2020-01-01 00:40:00        7       15       63           85
##  6 2020-01-01 00:50:00        8       17       35           60
##  7 2020-01-01 01:00:00        7       17       47           71
##  8 2020-01-01 01:10:00        8       17       55           80
##  9 2020-01-01 01:20:00        7       17       34           58
## 10 2020-01-01 01:30:00        8       17       41           66
## # ℹ 18,854 more rows

Data Cleaning

Address missing electricity readings by filling with the average of the first 144 observations (per energy meter).
Standardize timestamps and remove duplicates (if any).
Document each cleaning step.

Step 1. Cek apakah terdapat missing value pada tiap data

sum(is.na(wifi_10))

## [1] 0

sum(is.na(library1_10))

## [1] 3047

sum(is.na(library2_10))

## [1] 3047

sum(is.na(library3_10))

## [1] 3047

Note: missing value hanya ada pada data library (sehingga data wifi tidak perlu dikomputasi)

fill_first144 <- function(x) {
  x[is.na(x)] <- mean(x[1:144], na.rm = TRUE)
  return(x)
}

library1_10$library1 <- fill_first144(library1_10$library1)
library2_10$library2 <- fill_first144(library2_10$library2)
library3_10$library3 <- fill_first144(library3_10$library3)

Mengecek apakah missing value sudah hilang

sum(is.na(library1_10))

## [1] 0

sum(is.na(library2_10))

## [1] 0

sum(is.na(library2_10))

## [1] 0

Standarisasi timestamp dan menggabungkan dataset serta Menggabungkan wifi dan library jadi satu data frame

df <- wifi_10 %>%
  full_join(library1_10, by = c("Timestamp" = "ts")) %>%
  full_join(library2_10, by = c("Timestamp" = "ts")) %>%
  full_join(library3_10, by = c("Timestamp" = "ts")) %>%
  mutate(across(c(library1, library2, library3), ~ replace_na(., 0))) %>%
  mutate(total_energy = library1 + library2 + library3)

print(df)

## # A tibble: 18,864 × 6
##    Timestamp           occupancy library1 library2 library3 total_energy
##    <dttm>                  <dbl>    <dbl>    <dbl>    <dbl>        <dbl>
##  1 2020-02-01 00:00:00     13.1      7.38     17.9     39.6         64.8
##  2 2020-02-01 00:10:00     11.8     13        11       73           97  
##  3 2020-02-01 00:20:00     10       13        11       72           96  
##  4 2020-02-01 00:30:00      8.79    11        15       74          100  
##  5 2020-02-01 00:40:00      7.07    11        13       65           89  
##  6 2020-02-01 00:50:00      6       11        16       64           91  
##  7 2020-02-01 01:00:00      5.29    10        16       64           90  
##  8 2020-02-01 01:10:00      4.95    11        15       66           92  
##  9 2020-02-01 01:20:00      4.86    11        15       61           87  
## 10 2020-02-01 01:30:00      4.79    12        13       58           83  
## # ℹ 18,854 more rows

Visualization of Occupancy–Energy Relationship

Time Series Plots: show occupancy and energy consumption patterns over the month
Scatter Plot: occupancy (X) vs energy consumption (Y).
Daily Profiles (24h): illustrate daily cycles (each day represented by one line of time series).
Overlay the average daily profile for general patterns.

Time Series Plot

# `scales = "free_y"` scales each facet to its own data range.
p_timeseries <- df %>%
  pivot_longer(cols = c("occupancy", "total_energy"), names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = Timestamp, y = value, color = metric)) +
  geom_line(alpha = 0.8) +
  facet_wrap(~metric, scales = "free_y", ncol = 1) +
  labs(title = "Library Occupancy and Energy Consumption Over Time", x = "Date", y = "Value") +
  scale_x_datetime(labels = date_format("%b %d")) +
  theme_minimal() +
  theme(legend.position = "none")

print(p_timeseries)

## Warning: Removed 15181 rows containing missing values or values outside the scale range
## (`geom_line()`).

dapat dilihat bahwa pada total_energy, data di record dari bulan januari-mei tetapi occupancy hanya sekitar bulan februari, maka kita hanya membandingkan dengan data yang ada pada data occupancy agar grafik terlihat jelas

df_occ <- df %>%
  filter(!is.na(occupancy))

ggplot(df_occ, aes(x = Timestamp)) +
  geom_line(aes(y = occupancy, color = "Occupancy (devices)"), linewidth = 0.7) +
  geom_line(aes(y = total_energy, color = "Total Energy"), linewidth = 0.7) +
  scale_y_continuous(
    name = "Devices connected",
    sec.axis = sec_axis(~ ., name = "Energy (sum of meters)")
  ) +
  labs(
    title = "Occupancy vs Total Energy (Periode dengan Data Occupancy)",
    x = "Timestamp",
    color = "Legend"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    legend.position = "bottom"
  )

b)Scatter Plot

p_scatter <- ggplot(df, aes(x = occupancy, y = total_energy)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  geom_smooth(method = "lm", color = "firebrick") +
  labs(title = "Relationship Between Occupancy and Energy Consumption",
       x = "Occupancy (Associated WiFi Clients)", y = "Total Energy Consumption (kW)") +
  # Set axis limits to the exact min/max of the data
  coord_cartesian(
    xlim = range(df$occupancy, na.rm = TRUE),
    ylim = range(df$total_energy, na.rm = TRUE)
  ) +
  theme_minimal()

print(p_scatter)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 15181 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 15181 rows containing missing values or values outside the scale range
## (`geom_point()`).

c) Daily Profiles (24h)

# Ambil hanya baris dengan occupancy tidak NA
df_occ <- df %>% filter(!is.na(occupancy))

# Buat kolom tambahan (kalau belum ada)
df_occ <- df_occ %>%
  mutate(
    date = as.Date(Timestamp),
    time_of_day = hms::as_hms(Timestamp)
  )

# Hitung rata-rata harian per jam/menit (dari data occupancy yang ada)
avg_profile <- df_occ %>%
  group_by(time_of_day) %>%
  summarise(
    avg_occupancy = mean(occupancy, na.rm = TRUE),
    avg_energy = mean(total_energy, na.rm = TRUE),
    .groups = "drop"
  )

# Occupancy Daily Profile
p1 <- ggplot(df_occ, aes(x = time_of_day, y = occupancy)) +
  geom_line(aes(group = date), alpha = 0.15, color = "gray") +
  geom_line(data = avg_profile, aes(y = avg_occupancy), color = "dodgerblue", linewidth = 1.2) +
  labs(title = "Daily Occupancy Profiles (Only Period with Occupancy Data)",
       x = "Time of Day", y = "Occupancy") +
  coord_cartesian(ylim = range(df_occ$occupancy, na.rm = TRUE)) +
  scale_x_time(labels = time_format("%H:%M")) +
  theme_minimal()

# Energy Daily Profile
p2 <- ggplot(df_occ, aes(x = time_of_day, y = total_energy)) +
  geom_line(aes(group = date), alpha = 0.15, color = "gray") +
  geom_line(data = avg_profile, aes(y = avg_energy), color = "orangered", linewidth = 1.2) +
  labs(title = "Daily Energy Profiles (Same Period as Occupancy)",
       x = "Time of Day", y = "Total Energy (kW)") +
  coord_cartesian(ylim = range(df_occ$total_energy, na.rm = TRUE)) +
  scale_x_time(labels = time_format("%H:%M")) +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, ncol = 1)

5. Analysis - Identify peak hours of occupancy in the library. - Examine whether occupancy significantly influences energy consumption. - Highlight cases where energy consumption does not align with occupancy. a) identifikasi peak hours

peak_hours <- df %>%
  mutate(hour = floor_date(Timestamp, "hour")) %>%   # ubah ke jam (flooring)
  group_by(hour) %>%
  summarise(mean_occ = mean(occupancy, na.rm = TRUE), .groups = "drop") %>%
  mutate(hour_of_day = hour(hour)) %>%               # ambil jam dari 0-23
  group_by(hour_of_day) %>%
  summarise(mean_occ = mean(mean_occ, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_occ))

print("Peak hours (by average occupancy):")

## [1] "Peak hours (by average occupancy):"

print(head(peak_hours, 10))

## # A tibble: 10 × 2
##    hour_of_day mean_occ
##          <int>    <dbl>
##  1          15     212.
##  2          14     205.
##  3          16     198.
##  4          13     190.
##  5          12     172.
##  6          17     167.
##  7          11     150.
##  8          18     133.
##  9          19     110.
## 10          10     105.

Correlation Analysis We calculate the Pearson correlation coefficient to measure the strength and direction of the linear relationship.

corr <- cor(df$occupancy, df$total_energy, use = "complete.obs")
cat("Korelasi antara occupancy dan total_energy:", corr, "\n")

## Korelasi antara occupancy dan total_energy: 0.8780428

Linear Regression Analysis We build a linear model (\(Energy = \beta_0 + \beta_1 \times Occupancy + \epsilon\)) to predict energy consumption from occupancy.

model <- lm(total_energy ~ occupancy, data = df)
summary(model)

## 
## Call:
## lm(formula = total_energy ~ occupancy, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.298 -15.563   1.756  16.206  50.921 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.037e+02  4.947e-01   209.6   <2e-16 ***
## occupancy   4.658e-01  4.185e-03   111.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.92 on 3681 degrees of freedom
##   (15181 observations deleted due to missingness)
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.7709 
## F-statistic: 1.239e+04 on 1 and 3681 DF,  p-value: < 2.2e-16

Time Series Decomposition We decompose the energy time series into its trend, seasonal, and random components to understand underlying patterns. The frequency is set to 144 (6 intervals/hour * 24 hours) to capture the daily pattern.

energy_ts <- ts(df$total_energy, frequency = 144)
stl_decomposition <- stl(energy_ts, s.window = "periodic")
plot(stl_decomposition)

e) Time Series Regression (ARIMAX Model) We use an ARIMAX model to account for autocorrelation in the time series data. This provides a more robust estimate of occupancy’s effect on energy.

# This can take a minute to run
arimax_model <- auto.arima(
  df$total_energy,
  xreg = as.matrix(df$occupancy)
)
summary(arimax_model)

## Series: df$total_energy 
## Regression with ARIMA(2,1,4) errors 
## 
## Coefficients:
##          ar1     ar2      ma1      ma2     ma3     ma4    xreg
##       0.5135  0.2978  -0.8987  -0.2031  0.1635  0.0756  0.3038
## s.e.  0.3942  0.3505   0.3926   0.5047  0.1106  0.0401  0.0230
## 
## sigma^2 = 16.87:  log likelihood = -13432.99
## AIC=26881.97   AICc=26882.01   BIC=26931.66
## 
## Training set error measures:
##                      ME     RMSE      MAE        MPE     MAPE     MASE
## Training set 0.02625732 9.292172 6.978032 -0.4578907 5.854358 1.249609
##                    ACF1
## Training set 0.00173451

Kasus yang tidak sejalan dengan hunian

df <- df %>%
  mutate(
    energy_per_device = total_energy / ifelse(occupancy == 0, NA, occupancy),  # hindari pembagian 0
    high_epd = energy_per_device > quantile(energy_per_device, 0.99, na.rm = TRUE),
    low_occ = occupancy < quantile(occupancy, 0.5, na.rm = TRUE)
  )

mismatch_cases <- df %>%
  filter(high_epd & low_occ)

cat("Number of mismatch cases (high energy per device & occupancy below median):", 
    nrow(mismatch_cases), "\n")

## Number of mismatch cases (high energy per device & occupancy below median): 36

print(mismatch_cases)

## # A tibble: 36 × 9
##    Timestamp           occupancy library1 library2 library3 total_energy
##    <dttm>                  <dbl>    <dbl>    <dbl>    <dbl>        <dbl>
##  1 2020-02-01 05:40:00      2          14       13       49           76
##  2 2020-02-01 05:50:00      2          14       14       46           74
##  3 2020-02-01 06:50:00      2          14       11       51           76
##  4 2020-02-01 07:00:00      2.07       13       11       44           68
##  5 2020-02-01 07:30:00      2.14       13       12       50           75
##  6 2020-02-01 08:00:00      2.71       12       10       76           98
##  7 2020-02-01 08:10:00      2.71       13       11       71           95
##  8 2020-02-01 08:20:00      2.93       12        9       82          103
##  9 2020-02-02 03:50:00      2.71       16       23       71          110
## 10 2020-02-02 05:50:00      2.5        15       26       46           87
## # ℹ 26 more rows
## # ℹ 3 more variables: energy_per_device <dbl>, high_epd <lgl>, low_occ <lgl>

Weekend vs Weekday Comparison

Separate the dataset into weekdays (Mon–Fri) and weekends (Sat–Sun).
Reproduce the previous visualizations for both categories.
Compare and interpret the differences.

[dihitung dengan data yang terdapat bulan januari-mei]

df$day_of_week <- weekdays(as.Date(df$Timestamp))

# WEEKDAYS DATASET
df_weekdays <- df %>%
  filter(!(day_of_week %in% c("Saturday", "Sunday")))

# WEEKEND DATASET
df_weekends <- df %>%
  filter(day_of_week %in% c("Saturday", "Sunday"))

total_weekday <- nrow(df_weekdays)
total_weekend <- nrow(df_weekends)

cat("Total Weekday data:", total_weekday, "\n")

## Total Weekday data: 13392

cat("Total Weekend data:", total_weekend, "\n")

## Total Weekend data: 5472

[Data menyesuaikan occupancy]

df_occ$day_of_week <- weekdays(as.Date(df_occ$Timestamp))

# WEEKDAYS DATASET
df_weekdays_occ <- df_occ %>%
  filter(!(day_of_week %in% c("Saturday", "Sunday")))

# WEEKEND DATASET
df_weekends_occ <- df_occ %>%
  filter(day_of_week %in% c("Saturday", "Sunday"))

total_weekday_occ <- nrow(df_weekdays_occ)
total_weekend_occ <- nrow(df_weekends_occ)

cat("Total Weekday data (menyesuaikan):", total_weekday_occ, "\n")

## Total Weekday data (menyesuaikan): 2531

cat("Total Weekend data (menyesuaikan):", total_weekend_occ, "\n")

## Total Weekend data (menyesuaikan): 1152

Reproduce the previous visualizations for both categories a) Average daily occupancy profile (24h) for weekday vs weekend

df_occ <- df_occ %>%
  mutate(IsWeekend = ifelse(day_of_week %in% c("Saturday", "Sunday"), "Weekend", "Weekday"),
         TimePOSIX = as.POSIXct(format(Timestamp, "%H:%M:%S"), format = "%H:%M:%S"))

ggplot(df_occ, aes(x = TimePOSIX, y = occupancy,
                                      group = IsWeekend, color = IsWeekend)) +
  stat_summary(fun = mean, geom = "line", size = 1.2) +
  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
  labs(title = "Weekday vs Weekend - Average Daily Occupancy (24h)",
       x = "Jam", y = "Rata-rata Occupancy", color = "") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

b) Daily profiles

df_occ <- df_occ %>%
  mutate(hour = as.integer(format(Timestamp, "%H")),
         category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))

ggplot(df_occ, aes(x = hour, y = total_energy, 
                       group = as.Date(Timestamp), color = category)) +
  geom_line(alpha = 0.3) +
  labs(title = "Daily Profiles: Weekday vs Weekend",
       x = "Hour of Day", y = "Total Energy") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
  theme_minimal()

c) Rata-rata energi Weekday vs Weekend

df_occ <- df_occ %>%
  mutate(hour = as.integer(format(Timestamp, "%H")),
         category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))

ggplot(df_occ, aes(x = hour, y = total_energy, color = category)) +
  stat_summary(fun = mean, geom = "line", size = 1.2) +
  scale_x_continuous(breaks = seq(0, 23, by = 2)) +
  labs(title = "Weekday vs Weekend - Rata-rata Profil Energi Harian",
       x = "Jam", y = "Rata-rata Total Energy", color = "") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
  theme_minimal()

d) Time series plot (Total Energy & Occupancy)

df_occ <- df_occ %>%
  mutate(category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))

ggplot(df_occ, aes(x = Timestamp)) +
  geom_line(aes(y = total_energy, color = "Total Energy")) +
  geom_line(aes(y = occupancy, color = "Occupancy")) +
  facet_wrap(~category, scales = "free_x", ncol = 1) +
  labs(title = "Time Series: Weekday vs Weekend",
       x = "Time", y = "Value", color = "") +
  scale_color_manual(values = c("Total Energy" = "blue", "Occupancy" = "orange")) +
  theme_minimal()

e) Scatter plot Occupancy vs Energy (Weekday vs Weekend)

df_occ <- df_occ %>%
  mutate(category = factor(ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"),
                           levels = c("Weekday", "Weekend")))

ggplot(df_occ, aes(x = occupancy, y = total_energy, color = category)) + geom_point(alpha = 0.4) + labs(title = "Scatter Plot: Occupancy vs Energy (Weekday vs Weekend)", x = "Occupancy", y = "Total Energy", color = "Category") + scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) + theme_minimal()

f) Regresi sederhana (weekday vs weekend)

data_weekday <- df_occ %>%
  filter(category == "Weekday")
data_weekend <- df_occ %>%
  filter(category == "Weekend")

# Weekday regression
model_wd <- lm(total_energy ~ occupancy, data = data_weekday)
cat("Weekday Regression Summary:\n")

## Weekday Regression Summary:

summary(model_wd)

## 
## Call:
## lm(formula = total_energy ~ occupancy, data = data_weekday)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57.80 -14.91   1.05  15.99  50.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.043e+02  6.178e-01   168.9   <2e-16 ***
## occupancy   4.422e-01  4.631e-03    95.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.83 on 2529 degrees of freedom
## Multiple R-squared:  0.7829, Adjusted R-squared:  0.7828 
## F-statistic:  9120 on 1 and 2529 DF,  p-value: < 2.2e-16

# Weekend regression
model_we <- lm(total_energy ~ occupancy, data = data_weekend)
cat("\nWeekend Regression Summary:\n")

## 
## Weekend Regression Summary:

summary(model_we)

## 
## Call:
## lm(formula = total_energy ~ occupancy, data = data_weekend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.789 -11.249   0.603  11.546  49.059 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 95.824346   0.719133  133.25   <2e-16 ***
## occupancy    0.682753   0.009639   70.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.99 on 1150 degrees of freedom
## Multiple R-squared:  0.8135, Adjusted R-squared:  0.8134 
## F-statistic:  5017 on 1 and 1150 DF,  p-value: < 2.2e-16

Coding Library Energy Analysis - Datmin