Data Mining Kelompok 3 Anggota Kelompok. Ara Dina Mumtaza (5003231104). Dea Kharisma Nuril Ba’is (5003231109). Jasmine Sabrina Rahmaira Saputra (5003231114). Artera Dwi Septianingtyas (5003231118)

TAHAPAN

  1. Import library yang diperlukan dan load data
# 1. Import Library & Data
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(gridExtra) # For arranging plots
## Warning: package 'gridExtra' was built under R version 4.3.3
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(forecast)  # For time series models
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(scales)    # For formatting axis labels
## Warning: package 'scales' was built under R version 4.3.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggplot2)

Load Data

# 2. Input Data
wifi     <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/wifi.csv")
## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library1 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library1.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library2 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library2.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library3 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library3.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Mengecek data yang terbaca

head(wifi)
## # A tibble: 6 × 7
##   time                `Event Time` Associated Client Co…¹ Authenticated Client…²
##   <dttm>              <chr>                         <dbl>                  <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 …                    184                    182
## 2 2020-02-01 00:02:12 Sat Feb 01 …                      6                      6
## 3 2020-02-01 00:02:12 Sat Feb 01 …                     18                     18
## 4 2020-02-01 00:02:12 Sat Feb 01 …                     23                     23
## 5 2020-02-01 00:02:12 Sat Feb 01 …                     45                     45
## 6 2020-02-01 00:02:12 Sat Feb 01 …                     16                     16
## # ℹ abbreviated names: ¹​`Associated Client Count`,
## #   ²​`Authenticated Client Count`
## # ℹ 3 more variables: Uni <chr>, Building <chr>, Floor <chr>
head(library1)
## # A tibble: 6 × 6
##   ts                  name              reading units cumulative  rate
##   <dttm>              <chr>               <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh      1489442    NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh      1489449     7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh      1489456     7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh      1489464     8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh      1489471     7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh      1489479     8
head(library2)
## # A tibble: 6 × 6
##   ts                  name               reading units cumulative  rate
##   <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh      2129016    NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh      2129034    18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh      2129054    20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh      2129071    17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh      2129086    15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh      2129103    17
head(library3)
## # A tibble: 6 × 6
##   ts                  name               reading units cumulative  rate
##   <dttm>              <chr>                <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh      6914209    NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh      6914266    57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh      6914310    44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh      6914376    66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh      6914439    63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh      6914474    35
  1. Filter & Synchronize the Data
  1. Filter hanya Library pada wifi
wifi_library <- wifi %>%
  filter(grepl("Library", Building, ignore.case = TRUE)) %>%
  mutate(Timestamp = ymd_hms(time))

head(wifi_library)
## # A tibble: 6 × 8
##   time                `Event Time` Associated Client Co…¹ Authenticated Client…²
##   <dttm>              <chr>                         <dbl>                  <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 …                      1                      0
## 2 2020-02-01 00:02:12 Sat Feb 01 …                     29                     28
## 3 2020-02-01 00:02:12 Sat Feb 01 …                      0                      0
## 4 2020-02-01 00:02:12 Sat Feb 01 …                      0                      0
## 5 2020-02-01 00:02:12 Sat Feb 01 …                      0                      0
## 6 2020-02-01 00:02:12 Sat Feb 01 …                     21                     20
## # ℹ abbreviated names: ¹​`Associated Client Count`,
## #   ²​`Authenticated Client Count`
## # ℹ 4 more variables: Uni <chr>, Building <chr>, Floor <chr>, Timestamp <dttm>
  1. Resample WiFi ke interval 10 menit (jawaban telah sesuai dengan komputasi python)
wifi_10 <- wifi_library %>%
  mutate(Timestamp = floor_date(Timestamp, "10 minutes")) %>%
  group_by(Timestamp) %>%
  summarise(occupancy = mean(`Associated Client Count`, na.rm = TRUE)) %>%
  ungroup()

print(wifi_10)
## # A tibble: 3,683 × 2
##    Timestamp           occupancy
##    <dttm>                  <dbl>
##  1 2020-02-01 00:00:00     13.1 
##  2 2020-02-01 00:10:00     11.8 
##  3 2020-02-01 00:20:00     10   
##  4 2020-02-01 00:30:00      8.79
##  5 2020-02-01 00:40:00      7.07
##  6 2020-02-01 00:50:00      6   
##  7 2020-02-01 01:00:00      5.29
##  8 2020-02-01 01:10:00      4.95
##  9 2020-02-01 01:20:00      4.86
## 10 2020-02-01 01:30:00      4.79
## # ℹ 3,673 more rows
  1. Resample masing-masing file library ke 10 menit dan menggabungkan data library serta menghitung total atau jumlah energi dalam satu dataframe “energy”
library1_10 <- library1 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(library1 = mean(rate, na.rm = TRUE), .groups = "drop")

library2_10 <- library2 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(library2 = mean(rate, na.rm = TRUE), .groups = "drop")

library3_10 <- library3 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(library3 = mean(rate, na.rm = TRUE), .groups = "drop")

# Gabungkan jadi satu data frame
energy <- library1_10 %>%
  full_join(library2_10, by = "ts") %>%
  full_join(library3_10, by = "ts") %>%
  mutate(total_energy = library1 + library2 + library3)

print(energy)
## # A tibble: 18,864 × 5
##    ts                  library1 library2 library3 total_energy
##    <dttm>                 <dbl>    <dbl>    <dbl>        <dbl>
##  1 2020-01-01 00:00:00      NaN      NaN      NaN          NaN
##  2 2020-01-01 00:10:00        7       18       57           82
##  3 2020-01-01 00:20:00        7       20       44           71
##  4 2020-01-01 00:30:00        8       17       66           91
##  5 2020-01-01 00:40:00        7       15       63           85
##  6 2020-01-01 00:50:00        8       17       35           60
##  7 2020-01-01 01:00:00        7       17       47           71
##  8 2020-01-01 01:10:00        8       17       55           80
##  9 2020-01-01 01:20:00        7       17       34           58
## 10 2020-01-01 01:30:00        8       17       41           66
## # ℹ 18,854 more rows
  1. Data Cleaning

Step 1. Cek apakah terdapat missing value pada tiap data

sum(is.na(wifi_10))
## [1] 0
sum(is.na(library1_10))
## [1] 3047
sum(is.na(library2_10))
## [1] 3047
sum(is.na(library3_10))
## [1] 3047

Note: missing value hanya ada pada data library (sehingga data wifi tidak perlu dikomputasi)

fill_first144 <- function(x) {
  x[is.na(x)] <- mean(x[1:144], na.rm = TRUE)
  return(x)
}

library1_10$library1 <- fill_first144(library1_10$library1)
library2_10$library2 <- fill_first144(library2_10$library2)
library3_10$library3 <- fill_first144(library3_10$library3)

Mengecek apakah missing value sudah hilang

sum(is.na(library1_10))
## [1] 0
sum(is.na(library2_10))
## [1] 0
sum(is.na(library2_10))
## [1] 0

Standarisasi timestamp dan menggabungkan dataset serta Menggabungkan wifi dan library jadi satu data frame

df <- wifi_10 %>%
  full_join(library1_10, by = c("Timestamp" = "ts")) %>%
  full_join(library2_10, by = c("Timestamp" = "ts")) %>%
  full_join(library3_10, by = c("Timestamp" = "ts")) %>%
  mutate(across(c(library1, library2, library3), ~ replace_na(., 0))) %>%
  mutate(total_energy = library1 + library2 + library3)

print(df)
## # A tibble: 18,864 × 6
##    Timestamp           occupancy library1 library2 library3 total_energy
##    <dttm>                  <dbl>    <dbl>    <dbl>    <dbl>        <dbl>
##  1 2020-02-01 00:00:00     13.1      7.38     17.9     39.6         64.8
##  2 2020-02-01 00:10:00     11.8     13        11       73           97  
##  3 2020-02-01 00:20:00     10       13        11       72           96  
##  4 2020-02-01 00:30:00      8.79    11        15       74          100  
##  5 2020-02-01 00:40:00      7.07    11        13       65           89  
##  6 2020-02-01 00:50:00      6       11        16       64           91  
##  7 2020-02-01 01:00:00      5.29    10        16       64           90  
##  8 2020-02-01 01:10:00      4.95    11        15       66           92  
##  9 2020-02-01 01:20:00      4.86    11        15       61           87  
## 10 2020-02-01 01:30:00      4.79    12        13       58           83  
## # ℹ 18,854 more rows
  1. Visualization of Occupancy–Energy Relationship
  1. Time Series Plot
# `scales = "free_y"` scales each facet to its own data range.
p_timeseries <- df %>%
  pivot_longer(cols = c("occupancy", "total_energy"), names_to = "metric", values_to = "value") %>%
  ggplot(aes(x = Timestamp, y = value, color = metric)) +
  geom_line(alpha = 0.8) +
  facet_wrap(~metric, scales = "free_y", ncol = 1) +
  labs(title = "Library Occupancy and Energy Consumption Over Time", x = "Date", y = "Value") +
  scale_x_datetime(labels = date_format("%b %d")) +
  theme_minimal() +
  theme(legend.position = "none")

print(p_timeseries)
## Warning: Removed 15181 rows containing missing values or values outside the scale range
## (`geom_line()`).

dapat dilihat bahwa pada total_energy, data di record dari bulan januari-mei tetapi occupancy hanya sekitar bulan februari, maka kita hanya membandingkan dengan data yang ada pada data occupancy agar grafik terlihat jelas

df_occ <- df %>%
  filter(!is.na(occupancy))

ggplot(df_occ, aes(x = Timestamp)) +
  geom_line(aes(y = occupancy, color = "Occupancy (devices)"), linewidth = 0.7) +
  geom_line(aes(y = total_energy, color = "Total Energy"), linewidth = 0.7) +
  scale_y_continuous(
    name = "Devices connected",
    sec.axis = sec_axis(~ ., name = "Energy (sum of meters)")
  ) +
  labs(
    title = "Occupancy vs Total Energy (Periode dengan Data Occupancy)",
    x = "Timestamp",
    color = "Legend"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    legend.position = "bottom"
  )

b)Scatter Plot

p_scatter <- ggplot(df, aes(x = occupancy, y = total_energy)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  geom_smooth(method = "lm", color = "firebrick") +
  labs(title = "Relationship Between Occupancy and Energy Consumption",
       x = "Occupancy (Associated WiFi Clients)", y = "Total Energy Consumption (kW)") +
  # Set axis limits to the exact min/max of the data
  coord_cartesian(
    xlim = range(df$occupancy, na.rm = TRUE),
    ylim = range(df$total_energy, na.rm = TRUE)
  ) +
  theme_minimal()

print(p_scatter)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 15181 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 15181 rows containing missing values or values outside the scale range
## (`geom_point()`).

c) Daily Profiles (24h)

# Ambil hanya baris dengan occupancy tidak NA
df_occ <- df %>% filter(!is.na(occupancy))

# Buat kolom tambahan (kalau belum ada)
df_occ <- df_occ %>%
  mutate(
    date = as.Date(Timestamp),
    time_of_day = hms::as_hms(Timestamp)
  )

# Hitung rata-rata harian per jam/menit (dari data occupancy yang ada)
avg_profile <- df_occ %>%
  group_by(time_of_day) %>%
  summarise(
    avg_occupancy = mean(occupancy, na.rm = TRUE),
    avg_energy = mean(total_energy, na.rm = TRUE),
    .groups = "drop"
  )

# Occupancy Daily Profile
p1 <- ggplot(df_occ, aes(x = time_of_day, y = occupancy)) +
  geom_line(aes(group = date), alpha = 0.15, color = "gray") +
  geom_line(data = avg_profile, aes(y = avg_occupancy), color = "dodgerblue", linewidth = 1.2) +
  labs(title = "Daily Occupancy Profiles (Only Period with Occupancy Data)",
       x = "Time of Day", y = "Occupancy") +
  coord_cartesian(ylim = range(df_occ$occupancy, na.rm = TRUE)) +
  scale_x_time(labels = time_format("%H:%M")) +
  theme_minimal()

# Energy Daily Profile
p2 <- ggplot(df_occ, aes(x = time_of_day, y = total_energy)) +
  geom_line(aes(group = date), alpha = 0.15, color = "gray") +
  geom_line(data = avg_profile, aes(y = avg_energy), color = "orangered", linewidth = 1.2) +
  labs(title = "Daily Energy Profiles (Same Period as Occupancy)",
       x = "Time of Day", y = "Total Energy (kW)") +
  coord_cartesian(ylim = range(df_occ$total_energy, na.rm = TRUE)) +
  scale_x_time(labels = time_format("%H:%M")) +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, ncol = 1)

5. Analysis - Identify peak hours of occupancy in the library. - Examine whether occupancy significantly influences energy consumption. - Highlight cases where energy consumption does not align with occupancy. a) identifikasi peak hours

peak_hours <- df %>%
  mutate(hour = floor_date(Timestamp, "hour")) %>%   # ubah ke jam (flooring)
  group_by(hour) %>%
  summarise(mean_occ = mean(occupancy, na.rm = TRUE), .groups = "drop") %>%
  mutate(hour_of_day = hour(hour)) %>%               # ambil jam dari 0-23
  group_by(hour_of_day) %>%
  summarise(mean_occ = mean(mean_occ, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_occ))

print("Peak hours (by average occupancy):")
## [1] "Peak hours (by average occupancy):"
print(head(peak_hours, 10))
## # A tibble: 10 × 2
##    hour_of_day mean_occ
##          <int>    <dbl>
##  1          15     212.
##  2          14     205.
##  3          16     198.
##  4          13     190.
##  5          12     172.
##  6          17     167.
##  7          11     150.
##  8          18     133.
##  9          19     110.
## 10          10     105.
  1. Correlation Analysis We calculate the Pearson correlation coefficient to measure the strength and direction of the linear relationship.
corr <- cor(df$occupancy, df$total_energy, use = "complete.obs")
cat("Korelasi antara occupancy dan total_energy:", corr, "\n")
## Korelasi antara occupancy dan total_energy: 0.8780428
  1. Linear Regression Analysis We build a linear model (\(Energy = \beta_0 + \beta_1 \times Occupancy + \epsilon\)) to predict energy consumption from occupancy.
model <- lm(total_energy ~ occupancy, data = df)
summary(model)
## 
## Call:
## lm(formula = total_energy ~ occupancy, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.298 -15.563   1.756  16.206  50.921 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.037e+02  4.947e-01   209.6   <2e-16 ***
## occupancy   4.658e-01  4.185e-03   111.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.92 on 3681 degrees of freedom
##   (15181 observations deleted due to missingness)
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.7709 
## F-statistic: 1.239e+04 on 1 and 3681 DF,  p-value: < 2.2e-16
  1. Time Series Decomposition We decompose the energy time series into its trend, seasonal, and random components to understand underlying patterns. The frequency is set to 144 (6 intervals/hour * 24 hours) to capture the daily pattern.
energy_ts <- ts(df$total_energy, frequency = 144)
stl_decomposition <- stl(energy_ts, s.window = "periodic")
plot(stl_decomposition)

e) Time Series Regression (ARIMAX Model) We use an ARIMAX model to account for autocorrelation in the time series data. This provides a more robust estimate of occupancy’s effect on energy.

# This can take a minute to run
arimax_model <- auto.arima(
  df$total_energy,
  xreg = as.matrix(df$occupancy)
)
summary(arimax_model)
## Series: df$total_energy 
## Regression with ARIMA(2,1,4) errors 
## 
## Coefficients:
##          ar1     ar2      ma1      ma2     ma3     ma4    xreg
##       0.5135  0.2978  -0.8987  -0.2031  0.1635  0.0756  0.3038
## s.e.  0.3942  0.3505   0.3926   0.5047  0.1106  0.0401  0.0230
## 
## sigma^2 = 16.87:  log likelihood = -13432.99
## AIC=26881.97   AICc=26882.01   BIC=26931.66
## 
## Training set error measures:
##                      ME     RMSE      MAE        MPE     MAPE     MASE
## Training set 0.02625732 9.292172 6.978032 -0.4578907 5.854358 1.249609
##                    ACF1
## Training set 0.00173451
  1. Kasus yang tidak sejalan dengan hunian
df <- df %>%
  mutate(
    energy_per_device = total_energy / ifelse(occupancy == 0, NA, occupancy),  # hindari pembagian 0
    high_epd = energy_per_device > quantile(energy_per_device, 0.99, na.rm = TRUE),
    low_occ = occupancy < quantile(occupancy, 0.5, na.rm = TRUE)
  )

mismatch_cases <- df %>%
  filter(high_epd & low_occ)

cat("Number of mismatch cases (high energy per device & occupancy below median):", 
    nrow(mismatch_cases), "\n")
## Number of mismatch cases (high energy per device & occupancy below median): 36
print(mismatch_cases)
## # A tibble: 36 × 9
##    Timestamp           occupancy library1 library2 library3 total_energy
##    <dttm>                  <dbl>    <dbl>    <dbl>    <dbl>        <dbl>
##  1 2020-02-01 05:40:00      2          14       13       49           76
##  2 2020-02-01 05:50:00      2          14       14       46           74
##  3 2020-02-01 06:50:00      2          14       11       51           76
##  4 2020-02-01 07:00:00      2.07       13       11       44           68
##  5 2020-02-01 07:30:00      2.14       13       12       50           75
##  6 2020-02-01 08:00:00      2.71       12       10       76           98
##  7 2020-02-01 08:10:00      2.71       13       11       71           95
##  8 2020-02-01 08:20:00      2.93       12        9       82          103
##  9 2020-02-02 03:50:00      2.71       16       23       71          110
## 10 2020-02-02 05:50:00      2.5        15       26       46           87
## # ℹ 26 more rows
## # ℹ 3 more variables: energy_per_device <dbl>, high_epd <lgl>, low_occ <lgl>
  1. Weekend vs Weekday Comparison

[dihitung dengan data yang terdapat bulan januari-mei]

df$day_of_week <- weekdays(as.Date(df$Timestamp))

# WEEKDAYS DATASET
df_weekdays <- df %>%
  filter(!(day_of_week %in% c("Saturday", "Sunday")))

# WEEKEND DATASET
df_weekends <- df %>%
  filter(day_of_week %in% c("Saturday", "Sunday"))

total_weekday <- nrow(df_weekdays)
total_weekend <- nrow(df_weekends)
cat("Total Weekday data:", total_weekday, "\n")
## Total Weekday data: 13392
cat("Total Weekend data:", total_weekend, "\n")
## Total Weekend data: 5472

[Data menyesuaikan occupancy]

df_occ$day_of_week <- weekdays(as.Date(df_occ$Timestamp))

# WEEKDAYS DATASET
df_weekdays_occ <- df_occ %>%
  filter(!(day_of_week %in% c("Saturday", "Sunday")))

# WEEKEND DATASET
df_weekends_occ <- df_occ %>%
  filter(day_of_week %in% c("Saturday", "Sunday"))

total_weekday_occ <- nrow(df_weekdays_occ)
total_weekend_occ <- nrow(df_weekends_occ)
cat("Total Weekday data (menyesuaikan):", total_weekday_occ, "\n")
## Total Weekday data (menyesuaikan): 2531
cat("Total Weekend data (menyesuaikan):", total_weekend_occ, "\n")
## Total Weekend data (menyesuaikan): 1152

Reproduce the previous visualizations for both categories a) Average daily occupancy profile (24h) for weekday vs weekend

df_occ <- df_occ %>%
  mutate(IsWeekend = ifelse(day_of_week %in% c("Saturday", "Sunday"), "Weekend", "Weekday"),
         TimePOSIX = as.POSIXct(format(Timestamp, "%H:%M:%S"), format = "%H:%M:%S"))

ggplot(df_occ, aes(x = TimePOSIX, y = occupancy,
                                      group = IsWeekend, color = IsWeekend)) +
  stat_summary(fun = mean, geom = "line", size = 1.2) +
  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
  labs(title = "Weekday vs Weekend - Average Daily Occupancy (24h)",
       x = "Jam", y = "Rata-rata Occupancy", color = "") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

b) Daily profiles

df_occ <- df_occ %>%
  mutate(hour = as.integer(format(Timestamp, "%H")),
         category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))

ggplot(df_occ, aes(x = hour, y = total_energy, 
                       group = as.Date(Timestamp), color = category)) +
  geom_line(alpha = 0.3) +
  labs(title = "Daily Profiles: Weekday vs Weekend",
       x = "Hour of Day", y = "Total Energy") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
  theme_minimal()

c) Rata-rata energi Weekday vs Weekend

df_occ <- df_occ %>%
  mutate(hour = as.integer(format(Timestamp, "%H")),
         category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))

ggplot(df_occ, aes(x = hour, y = total_energy, color = category)) +
  stat_summary(fun = mean, geom = "line", size = 1.2) +
  scale_x_continuous(breaks = seq(0, 23, by = 2)) +
  labs(title = "Weekday vs Weekend - Rata-rata Profil Energi Harian",
       x = "Jam", y = "Rata-rata Total Energy", color = "") +
  scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
  theme_minimal()

d) Time series plot (Total Energy & Occupancy)

df_occ <- df_occ %>%
  mutate(category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))

ggplot(df_occ, aes(x = Timestamp)) +
  geom_line(aes(y = total_energy, color = "Total Energy")) +
  geom_line(aes(y = occupancy, color = "Occupancy")) +
  facet_wrap(~category, scales = "free_x", ncol = 1) +
  labs(title = "Time Series: Weekday vs Weekend",
       x = "Time", y = "Value", color = "") +
  scale_color_manual(values = c("Total Energy" = "blue", "Occupancy" = "orange")) +
  theme_minimal()

e) Scatter plot Occupancy vs Energy (Weekday vs Weekend)

df_occ <- df_occ %>%
  mutate(category = factor(ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"),
                           levels = c("Weekday", "Weekend")))

ggplot(df_occ, aes(x = occupancy, y = total_energy, color = category)) + geom_point(alpha = 0.4) + labs(title = "Scatter Plot: Occupancy vs Energy (Weekday vs Weekend)", x = "Occupancy", y = "Total Energy", color = "Category") + scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) + theme_minimal()

f) Regresi sederhana (weekday vs weekend)

data_weekday <- df_occ %>%
  filter(category == "Weekday")
data_weekend <- df_occ %>%
  filter(category == "Weekend")

# Weekday regression
model_wd <- lm(total_energy ~ occupancy, data = data_weekday)
cat("Weekday Regression Summary:\n")
## Weekday Regression Summary:
summary(model_wd)
## 
## Call:
## lm(formula = total_energy ~ occupancy, data = data_weekday)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57.80 -14.91   1.05  15.99  50.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.043e+02  6.178e-01   168.9   <2e-16 ***
## occupancy   4.422e-01  4.631e-03    95.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.83 on 2529 degrees of freedom
## Multiple R-squared:  0.7829, Adjusted R-squared:  0.7828 
## F-statistic:  9120 on 1 and 2529 DF,  p-value: < 2.2e-16
# Weekend regression
model_we <- lm(total_energy ~ occupancy, data = data_weekend)
cat("\nWeekend Regression Summary:\n")
## 
## Weekend Regression Summary:
summary(model_we)
## 
## Call:
## lm(formula = total_energy ~ occupancy, data = data_weekend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.789 -11.249   0.603  11.546  49.059 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 95.824346   0.719133  133.25   <2e-16 ***
## occupancy    0.682753   0.009639   70.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.99 on 1150 degrees of freedom
## Multiple R-squared:  0.8135, Adjusted R-squared:  0.8134 
## F-statistic:  5017 on 1 and 1150 DF,  p-value: < 2.2e-16