Data Mining Kelompok 3 Anggota Kelompok. Ara Dina Mumtaza (5003231104). Dea Kharisma Nuril Ba’is (5003231109). Jasmine Sabrina Rahmaira Saputra (5003231114). Artera Dwi Septianingtyas (5003231118)
TAHAPAN
# 1. Import Library & Data
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(gridExtra) # For arranging plots
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(forecast) # For time series models
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(scales) # For formatting axis labels
## Warning: package 'scales' was built under R version 4.3.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
Load Data
# 2. Input Data
wifi <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/wifi.csv")
## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Event Time, Uni, Building, Floor
## dbl (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library1 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library1.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library2 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library2.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library3 <- read_csv("C:/Users/Mrs. Ira/Downloads/DMV_Kel3/library3.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Mengecek data yang terbaca
head(wifi)
## # A tibble: 6 × 7
## time `Event Time` Associated Client Co…¹ Authenticated Client…²
## <dttm> <chr> <dbl> <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 … 184 182
## 2 2020-02-01 00:02:12 Sat Feb 01 … 6 6
## 3 2020-02-01 00:02:12 Sat Feb 01 … 18 18
## 4 2020-02-01 00:02:12 Sat Feb 01 … 23 23
## 5 2020-02-01 00:02:12 Sat Feb 01 … 45 45
## 6 2020-02-01 00:02:12 Sat Feb 01 … 16 16
## # ℹ abbreviated names: ¹`Associated Client Count`,
## # ²`Authenticated Client Count`
## # ℹ 3 more variables: Uni <chr>, Building <chr>, Floor <chr>
head(library1)
## # A tibble: 6 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8
head(library2)
## # A tibble: 6 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh 2129016 NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh 2129034 18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh 2129054 20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh 2129071 17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh 2129086 15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh 2129103 17
head(library3)
## # A tibble: 6 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh 6914209 NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh 6914266 57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh 6914310 44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh 6914376 66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh 6914439 63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh 6914474 35
wifi_library <- wifi %>%
filter(grepl("Library", Building, ignore.case = TRUE)) %>%
mutate(Timestamp = ymd_hms(time))
head(wifi_library)
## # A tibble: 6 × 8
## time `Event Time` Associated Client Co…¹ Authenticated Client…²
## <dttm> <chr> <dbl> <dbl>
## 1 2020-02-01 00:02:12 Sat Feb 01 … 1 0
## 2 2020-02-01 00:02:12 Sat Feb 01 … 29 28
## 3 2020-02-01 00:02:12 Sat Feb 01 … 0 0
## 4 2020-02-01 00:02:12 Sat Feb 01 … 0 0
## 5 2020-02-01 00:02:12 Sat Feb 01 … 0 0
## 6 2020-02-01 00:02:12 Sat Feb 01 … 21 20
## # ℹ abbreviated names: ¹`Associated Client Count`,
## # ²`Authenticated Client Count`
## # ℹ 4 more variables: Uni <chr>, Building <chr>, Floor <chr>, Timestamp <dttm>
wifi_10 <- wifi_library %>%
mutate(Timestamp = floor_date(Timestamp, "10 minutes")) %>%
group_by(Timestamp) %>%
summarise(occupancy = mean(`Associated Client Count`, na.rm = TRUE)) %>%
ungroup()
print(wifi_10)
## # A tibble: 3,683 × 2
## Timestamp occupancy
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 13.1
## 2 2020-02-01 00:10:00 11.8
## 3 2020-02-01 00:20:00 10
## 4 2020-02-01 00:30:00 8.79
## 5 2020-02-01 00:40:00 7.07
## 6 2020-02-01 00:50:00 6
## 7 2020-02-01 01:00:00 5.29
## 8 2020-02-01 01:10:00 4.95
## 9 2020-02-01 01:20:00 4.86
## 10 2020-02-01 01:30:00 4.79
## # ℹ 3,673 more rows
library1_10 <- library1 %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(library1 = mean(rate, na.rm = TRUE), .groups = "drop")
library2_10 <- library2 %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(library2 = mean(rate, na.rm = TRUE), .groups = "drop")
library3_10 <- library3 %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(library3 = mean(rate, na.rm = TRUE), .groups = "drop")
# Gabungkan jadi satu data frame
energy <- library1_10 %>%
full_join(library2_10, by = "ts") %>%
full_join(library3_10, by = "ts") %>%
mutate(total_energy = library1 + library2 + library3)
print(energy)
## # A tibble: 18,864 × 5
## ts library1 library2 library3 total_energy
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-01-01 00:00:00 NaN NaN NaN NaN
## 2 2020-01-01 00:10:00 7 18 57 82
## 3 2020-01-01 00:20:00 7 20 44 71
## 4 2020-01-01 00:30:00 8 17 66 91
## 5 2020-01-01 00:40:00 7 15 63 85
## 6 2020-01-01 00:50:00 8 17 35 60
## 7 2020-01-01 01:00:00 7 17 47 71
## 8 2020-01-01 01:10:00 8 17 55 80
## 9 2020-01-01 01:20:00 7 17 34 58
## 10 2020-01-01 01:30:00 8 17 41 66
## # ℹ 18,854 more rows
Step 1. Cek apakah terdapat missing value pada tiap data
sum(is.na(wifi_10))
## [1] 0
sum(is.na(library1_10))
## [1] 3047
sum(is.na(library2_10))
## [1] 3047
sum(is.na(library3_10))
## [1] 3047
Note: missing value hanya ada pada data library (sehingga data wifi tidak perlu dikomputasi)
fill_first144 <- function(x) {
x[is.na(x)] <- mean(x[1:144], na.rm = TRUE)
return(x)
}
library1_10$library1 <- fill_first144(library1_10$library1)
library2_10$library2 <- fill_first144(library2_10$library2)
library3_10$library3 <- fill_first144(library3_10$library3)
Mengecek apakah missing value sudah hilang
sum(is.na(library1_10))
## [1] 0
sum(is.na(library2_10))
## [1] 0
sum(is.na(library2_10))
## [1] 0
Standarisasi timestamp dan menggabungkan dataset serta Menggabungkan wifi dan library jadi satu data frame
df <- wifi_10 %>%
full_join(library1_10, by = c("Timestamp" = "ts")) %>%
full_join(library2_10, by = c("Timestamp" = "ts")) %>%
full_join(library3_10, by = c("Timestamp" = "ts")) %>%
mutate(across(c(library1, library2, library3), ~ replace_na(., 0))) %>%
mutate(total_energy = library1 + library2 + library3)
print(df)
## # A tibble: 18,864 × 6
## Timestamp occupancy library1 library2 library3 total_energy
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-02-01 00:00:00 13.1 7.38 17.9 39.6 64.8
## 2 2020-02-01 00:10:00 11.8 13 11 73 97
## 3 2020-02-01 00:20:00 10 13 11 72 96
## 4 2020-02-01 00:30:00 8.79 11 15 74 100
## 5 2020-02-01 00:40:00 7.07 11 13 65 89
## 6 2020-02-01 00:50:00 6 11 16 64 91
## 7 2020-02-01 01:00:00 5.29 10 16 64 90
## 8 2020-02-01 01:10:00 4.95 11 15 66 92
## 9 2020-02-01 01:20:00 4.86 11 15 61 87
## 10 2020-02-01 01:30:00 4.79 12 13 58 83
## # ℹ 18,854 more rows
# `scales = "free_y"` scales each facet to its own data range.
p_timeseries <- df %>%
pivot_longer(cols = c("occupancy", "total_energy"), names_to = "metric", values_to = "value") %>%
ggplot(aes(x = Timestamp, y = value, color = metric)) +
geom_line(alpha = 0.8) +
facet_wrap(~metric, scales = "free_y", ncol = 1) +
labs(title = "Library Occupancy and Energy Consumption Over Time", x = "Date", y = "Value") +
scale_x_datetime(labels = date_format("%b %d")) +
theme_minimal() +
theme(legend.position = "none")
print(p_timeseries)
## Warning: Removed 15181 rows containing missing values or values outside the scale range
## (`geom_line()`).
dapat dilihat bahwa pada total_energy, data di record dari bulan
januari-mei tetapi occupancy hanya sekitar bulan februari, maka kita
hanya membandingkan dengan data yang ada pada data occupancy agar grafik
terlihat jelas
df_occ <- df %>%
filter(!is.na(occupancy))
ggplot(df_occ, aes(x = Timestamp)) +
geom_line(aes(y = occupancy, color = "Occupancy (devices)"), linewidth = 0.7) +
geom_line(aes(y = total_energy, color = "Total Energy"), linewidth = 0.7) +
scale_y_continuous(
name = "Devices connected",
sec.axis = sec_axis(~ ., name = "Energy (sum of meters)")
) +
labs(
title = "Occupancy vs Total Energy (Periode dengan Data Occupancy)",
x = "Timestamp",
color = "Legend"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
legend.position = "bottom"
)
b)Scatter Plot
p_scatter <- ggplot(df, aes(x = occupancy, y = total_energy)) +
geom_point(alpha = 0.4, color = "steelblue") +
geom_smooth(method = "lm", color = "firebrick") +
labs(title = "Relationship Between Occupancy and Energy Consumption",
x = "Occupancy (Associated WiFi Clients)", y = "Total Energy Consumption (kW)") +
# Set axis limits to the exact min/max of the data
coord_cartesian(
xlim = range(df$occupancy, na.rm = TRUE),
ylim = range(df$total_energy, na.rm = TRUE)
) +
theme_minimal()
print(p_scatter)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 15181 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 15181 rows containing missing values or values outside the scale range
## (`geom_point()`).
c) Daily Profiles (24h)
# Ambil hanya baris dengan occupancy tidak NA
df_occ <- df %>% filter(!is.na(occupancy))
# Buat kolom tambahan (kalau belum ada)
df_occ <- df_occ %>%
mutate(
date = as.Date(Timestamp),
time_of_day = hms::as_hms(Timestamp)
)
# Hitung rata-rata harian per jam/menit (dari data occupancy yang ada)
avg_profile <- df_occ %>%
group_by(time_of_day) %>%
summarise(
avg_occupancy = mean(occupancy, na.rm = TRUE),
avg_energy = mean(total_energy, na.rm = TRUE),
.groups = "drop"
)
# Occupancy Daily Profile
p1 <- ggplot(df_occ, aes(x = time_of_day, y = occupancy)) +
geom_line(aes(group = date), alpha = 0.15, color = "gray") +
geom_line(data = avg_profile, aes(y = avg_occupancy), color = "dodgerblue", linewidth = 1.2) +
labs(title = "Daily Occupancy Profiles (Only Period with Occupancy Data)",
x = "Time of Day", y = "Occupancy") +
coord_cartesian(ylim = range(df_occ$occupancy, na.rm = TRUE)) +
scale_x_time(labels = time_format("%H:%M")) +
theme_minimal()
# Energy Daily Profile
p2 <- ggplot(df_occ, aes(x = time_of_day, y = total_energy)) +
geom_line(aes(group = date), alpha = 0.15, color = "gray") +
geom_line(data = avg_profile, aes(y = avg_energy), color = "orangered", linewidth = 1.2) +
labs(title = "Daily Energy Profiles (Same Period as Occupancy)",
x = "Time of Day", y = "Total Energy (kW)") +
coord_cartesian(ylim = range(df_occ$total_energy, na.rm = TRUE)) +
scale_x_time(labels = time_format("%H:%M")) +
theme_minimal()
gridExtra::grid.arrange(p1, p2, ncol = 1)
5. Analysis - Identify peak hours of occupancy in the library. - Examine
whether occupancy significantly influences energy consumption. -
Highlight cases where energy consumption does not align with occupancy.
a) identifikasi peak hours
peak_hours <- df %>%
mutate(hour = floor_date(Timestamp, "hour")) %>% # ubah ke jam (flooring)
group_by(hour) %>%
summarise(mean_occ = mean(occupancy, na.rm = TRUE), .groups = "drop") %>%
mutate(hour_of_day = hour(hour)) %>% # ambil jam dari 0-23
group_by(hour_of_day) %>%
summarise(mean_occ = mean(mean_occ, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_occ))
print("Peak hours (by average occupancy):")
## [1] "Peak hours (by average occupancy):"
print(head(peak_hours, 10))
## # A tibble: 10 × 2
## hour_of_day mean_occ
## <int> <dbl>
## 1 15 212.
## 2 14 205.
## 3 16 198.
## 4 13 190.
## 5 12 172.
## 6 17 167.
## 7 11 150.
## 8 18 133.
## 9 19 110.
## 10 10 105.
corr <- cor(df$occupancy, df$total_energy, use = "complete.obs")
cat("Korelasi antara occupancy dan total_energy:", corr, "\n")
## Korelasi antara occupancy dan total_energy: 0.8780428
model <- lm(total_energy ~ occupancy, data = df)
summary(model)
##
## Call:
## lm(formula = total_energy ~ occupancy, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.298 -15.563 1.756 16.206 50.921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.037e+02 4.947e-01 209.6 <2e-16 ***
## occupancy 4.658e-01 4.185e-03 111.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.92 on 3681 degrees of freedom
## (15181 observations deleted due to missingness)
## Multiple R-squared: 0.771, Adjusted R-squared: 0.7709
## F-statistic: 1.239e+04 on 1 and 3681 DF, p-value: < 2.2e-16
energy_ts <- ts(df$total_energy, frequency = 144)
stl_decomposition <- stl(energy_ts, s.window = "periodic")
plot(stl_decomposition)
e) Time Series Regression (ARIMAX Model) We use an ARIMAX model to
account for autocorrelation in the time series data. This provides a
more robust estimate of occupancy’s effect on energy.
# This can take a minute to run
arimax_model <- auto.arima(
df$total_energy,
xreg = as.matrix(df$occupancy)
)
summary(arimax_model)
## Series: df$total_energy
## Regression with ARIMA(2,1,4) errors
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 xreg
## 0.5135 0.2978 -0.8987 -0.2031 0.1635 0.0756 0.3038
## s.e. 0.3942 0.3505 0.3926 0.5047 0.1106 0.0401 0.0230
##
## sigma^2 = 16.87: log likelihood = -13432.99
## AIC=26881.97 AICc=26882.01 BIC=26931.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.02625732 9.292172 6.978032 -0.4578907 5.854358 1.249609
## ACF1
## Training set 0.00173451
df <- df %>%
mutate(
energy_per_device = total_energy / ifelse(occupancy == 0, NA, occupancy), # hindari pembagian 0
high_epd = energy_per_device > quantile(energy_per_device, 0.99, na.rm = TRUE),
low_occ = occupancy < quantile(occupancy, 0.5, na.rm = TRUE)
)
mismatch_cases <- df %>%
filter(high_epd & low_occ)
cat("Number of mismatch cases (high energy per device & occupancy below median):",
nrow(mismatch_cases), "\n")
## Number of mismatch cases (high energy per device & occupancy below median): 36
print(mismatch_cases)
## # A tibble: 36 × 9
## Timestamp occupancy library1 library2 library3 total_energy
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-02-01 05:40:00 2 14 13 49 76
## 2 2020-02-01 05:50:00 2 14 14 46 74
## 3 2020-02-01 06:50:00 2 14 11 51 76
## 4 2020-02-01 07:00:00 2.07 13 11 44 68
## 5 2020-02-01 07:30:00 2.14 13 12 50 75
## 6 2020-02-01 08:00:00 2.71 12 10 76 98
## 7 2020-02-01 08:10:00 2.71 13 11 71 95
## 8 2020-02-01 08:20:00 2.93 12 9 82 103
## 9 2020-02-02 03:50:00 2.71 16 23 71 110
## 10 2020-02-02 05:50:00 2.5 15 26 46 87
## # ℹ 26 more rows
## # ℹ 3 more variables: energy_per_device <dbl>, high_epd <lgl>, low_occ <lgl>
[dihitung dengan data yang terdapat bulan januari-mei]
df$day_of_week <- weekdays(as.Date(df$Timestamp))
# WEEKDAYS DATASET
df_weekdays <- df %>%
filter(!(day_of_week %in% c("Saturday", "Sunday")))
# WEEKEND DATASET
df_weekends <- df %>%
filter(day_of_week %in% c("Saturday", "Sunday"))
total_weekday <- nrow(df_weekdays)
total_weekend <- nrow(df_weekends)
cat("Total Weekday data:", total_weekday, "\n")
## Total Weekday data: 13392
cat("Total Weekend data:", total_weekend, "\n")
## Total Weekend data: 5472
[Data menyesuaikan occupancy]
df_occ$day_of_week <- weekdays(as.Date(df_occ$Timestamp))
# WEEKDAYS DATASET
df_weekdays_occ <- df_occ %>%
filter(!(day_of_week %in% c("Saturday", "Sunday")))
# WEEKEND DATASET
df_weekends_occ <- df_occ %>%
filter(day_of_week %in% c("Saturday", "Sunday"))
total_weekday_occ <- nrow(df_weekdays_occ)
total_weekend_occ <- nrow(df_weekends_occ)
cat("Total Weekday data (menyesuaikan):", total_weekday_occ, "\n")
## Total Weekday data (menyesuaikan): 2531
cat("Total Weekend data (menyesuaikan):", total_weekend_occ, "\n")
## Total Weekend data (menyesuaikan): 1152
Reproduce the previous visualizations for both categories a) Average daily occupancy profile (24h) for weekday vs weekend
df_occ <- df_occ %>%
mutate(IsWeekend = ifelse(day_of_week %in% c("Saturday", "Sunday"), "Weekend", "Weekday"),
TimePOSIX = as.POSIXct(format(Timestamp, "%H:%M:%S"), format = "%H:%M:%S"))
ggplot(df_occ, aes(x = TimePOSIX, y = occupancy,
group = IsWeekend, color = IsWeekend)) +
stat_summary(fun = mean, geom = "line", size = 1.2) +
scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hour") +
scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
labs(title = "Weekday vs Weekend - Average Daily Occupancy (24h)",
x = "Jam", y = "Rata-rata Occupancy", color = "") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
b) Daily profiles
df_occ <- df_occ %>%
mutate(hour = as.integer(format(Timestamp, "%H")),
category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))
ggplot(df_occ, aes(x = hour, y = total_energy,
group = as.Date(Timestamp), color = category)) +
geom_line(alpha = 0.3) +
labs(title = "Daily Profiles: Weekday vs Weekend",
x = "Hour of Day", y = "Total Energy") +
scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
theme_minimal()
c) Rata-rata energi Weekday vs Weekend
df_occ <- df_occ %>%
mutate(hour = as.integer(format(Timestamp, "%H")),
category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))
ggplot(df_occ, aes(x = hour, y = total_energy, color = category)) +
stat_summary(fun = mean, geom = "line", size = 1.2) +
scale_x_continuous(breaks = seq(0, 23, by = 2)) +
labs(title = "Weekday vs Weekend - Rata-rata Profil Energi Harian",
x = "Jam", y = "Rata-rata Total Energy", color = "") +
scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) +
theme_minimal()
d) Time series plot (Total Energy & Occupancy)
df_occ <- df_occ %>%
mutate(category = ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"))
ggplot(df_occ, aes(x = Timestamp)) +
geom_line(aes(y = total_energy, color = "Total Energy")) +
geom_line(aes(y = occupancy, color = "Occupancy")) +
facet_wrap(~category, scales = "free_x", ncol = 1) +
labs(title = "Time Series: Weekday vs Weekend",
x = "Time", y = "Value", color = "") +
scale_color_manual(values = c("Total Energy" = "blue", "Occupancy" = "orange")) +
theme_minimal()
e) Scatter plot Occupancy vs Energy (Weekday vs Weekend)
df_occ <- df_occ %>%
mutate(category = factor(ifelse(IsWeekend == "Weekend", "Weekend", "Weekday"),
levels = c("Weekday", "Weekend")))
ggplot(df_occ, aes(x = occupancy, y = total_energy, color = category)) + geom_point(alpha = 0.4) + labs(title = "Scatter Plot: Occupancy vs Energy (Weekday vs Weekend)", x = "Occupancy", y = "Total Energy", color = "Category") + scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "orange")) + theme_minimal()
f) Regresi sederhana (weekday vs weekend)
data_weekday <- df_occ %>%
filter(category == "Weekday")
data_weekend <- df_occ %>%
filter(category == "Weekend")
# Weekday regression
model_wd <- lm(total_energy ~ occupancy, data = data_weekday)
cat("Weekday Regression Summary:\n")
## Weekday Regression Summary:
summary(model_wd)
##
## Call:
## lm(formula = total_energy ~ occupancy, data = data_weekday)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.80 -14.91 1.05 15.99 50.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.043e+02 6.178e-01 168.9 <2e-16 ***
## occupancy 4.422e-01 4.631e-03 95.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.83 on 2529 degrees of freedom
## Multiple R-squared: 0.7829, Adjusted R-squared: 0.7828
## F-statistic: 9120 on 1 and 2529 DF, p-value: < 2.2e-16
# Weekend regression
model_we <- lm(total_energy ~ occupancy, data = data_weekend)
cat("\nWeekend Regression Summary:\n")
##
## Weekend Regression Summary:
summary(model_we)
##
## Call:
## lm(formula = total_energy ~ occupancy, data = data_weekend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.789 -11.249 0.603 11.546 49.059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.824346 0.719133 133.25 <2e-16 ***
## occupancy 0.682753 0.009639 70.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.99 on 1150 degrees of freedom
## Multiple R-squared: 0.8135, Adjusted R-squared: 0.8134
## F-statistic: 5017 on 1 and 1150 DF, p-value: < 2.2e-16