Nama anggota:
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# membaca data csv
wifi<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//wifi.csv")
## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Event Time, Uni, Building, Floor
## dbl (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
wifi
# Menampilkan data lengkap library1
lib1<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//library1.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib1
# Menampilkan data lengkap library2
lib2<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//library2.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib2
# Menampilkan data lengkap library3
lib3<-read_csv("C://Users//khoir//OneDrive - Institut Teknologi Sepuluh Nopember//Documents//1. SMT 5//DATA MINING//library3.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib3
# Memfilter mengambil data wifi dengan ketentuan place hanya "Library"
wifi_clean<-wifi %>%
filter(Building == "Library")
wifi_clean
# Meresample data wifi clean ke interval 10 menit
wifi_resampled <- wifi_clean %>%
mutate(time = floor_date(time, "10 minutes")) %>%
group_by(time) %>%
summarise(mean_client = mean(`Associated Client Count`, na.rm = TRUE),
.groups = "drop")
wifi_resampled
energy_raw <- bind_rows(lib1, lib2, lib3)
energy_raw
DATA CLEANING
# Mengubah tipe waktu dan Menghapus duplikat pada dataset library (jika ada)
lib1 <- lib1 %>%
mutate(ts = as_datetime(ts, tz="UTC")) %>%
distinct()
lib2 <- lib2 %>%
mutate(ts = as_datetime(ts, tz="UTC")) %>%
distinct()
lib3 <- lib3 %>%
mutate(ts = as_datetime(ts, tz="UTC")) %>%
distinct()
print(lib1)
## # A tibble: 18,864 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8
## 7 2020-01-01 01:00:00 MC065-L01/M9R2048 1489486 KWh 1489486 7
## 8 2020-01-01 01:10:00 MC065-L01/M9R2048 1489494 KWh 1489494 8
## 9 2020-01-01 01:20:00 MC065-L01/M9R2048 1489501 KWh 1489501 7
## 10 2020-01-01 01:30:00 MC065-L01/M9R2048 1489509 KWh 1489509 8
## # ℹ 18,854 more rows
print(lib2)
## # A tibble: 18,864 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh 2129016 NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh 2129034 18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh 2129054 20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh 2129071 17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh 2129086 15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh 2129103 17
## 7 2020-01-01 01:00:00 MC065-L01/M11R2056 2129120 KWh 2129120 17
## 8 2020-01-01 01:10:00 MC065-L01/M11R2056 2129137 KWh 2129137 17
## 9 2020-01-01 01:20:00 MC065-L01/M11R2056 2129154 KWh 2129154 17
## 10 2020-01-01 01:30:00 MC065-L01/M11R2056 2129171 KWh 2129171 17
## # ℹ 18,854 more rows
print(lib3)
## # A tibble: 18,864 × 6
## ts name reading units cumulative rate
## <dttm> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh 6914209 NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh 6914266 57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh 6914310 44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh 6914376 66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh 6914439 63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh 6914474 35
## 7 2020-01-01 01:00:00 MC065-L01/M13R2064 6914521 KWh 6914521 47
## 8 2020-01-01 01:10:00 MC065-L01/M13R2064 6914576 KWh 6914576 55
## 9 2020-01-01 01:20:00 MC065-L01/M13R2064 6914610 KWh 6914610 34
## 10 2020-01-01 01:30:00 MC065-L01/M13R2064 6914651 KWh 6914651 41
## # ℹ 18,854 more rows
# Fungsi untuk mengisi nilai MV dengan mean dari 144 obs pertama/teratas
fill_missing_avg <- function(df){
baseline <- mean(df$rate[1:144], na.rm = TRUE)
df %>%
mutate(rate = ifelse(is.na(rate), baseline, rate)
)
}
# cek missing values sebelum cleaning
cat("Missing Values BEFORE cleaning:\n")
## Missing Values BEFORE cleaning:
sapply(
list(lib1=lib1, lib2=lib2, lib3=lib3),
function(df) sum(is.na(df$rate))
)
## lib1 lib2 lib3
## 3047 3047 3047
# apply ke masing-masing dataset
library1_clean <- fill_missing_avg(lib1)
library2_clean <- fill_missing_avg(lib2)
library3_clean <- fill_missing_avg(lib3)
# cek missing values setelah cleaning
cat("Missing values AFTER cleaning:\n")
## Missing values AFTER cleaning:
sapply(
list(library1_clean=library1_clean,
library2_clean=library2_clean,
library3_clean=library3_clean),
function(df) sum(is.na(df$rate))
)
## library1_clean library2_clean library3_clean
## 0 0 0
# Menggabung library yang sudah clean
energy_clean <- bind_rows(library1_clean, library2_clean, library3_clean)
# Pastikan ts datetime
energy_clean <- energy_clean %>%
mutate(ts = as_datetime(ts, tz="UTC"))
# Memastikan data dimulai dari waktu 00:00:00
ts_start <- as.POSIXct(paste0(as.Date(min(energy_clean$ts)), " 00:00:00"), tz="UTC")
ts_end <- max(energy_clean$ts, na.rm=TRUE)
# Resampling 10 menit
energy_resampled <- energy_clean %>%
mutate(ts_10min = floor_date(ts, "10 minutes")) %>%
group_by(ts_10min) %>%
summarise(rate = sum(rate, na.rm=TRUE), .groups="drop") %>%
complete(ts_10min = seq(ts_start, ts_end, by="10 min"),
fill = list(rate = 0)) %>%
arrange(ts_10min)
# Cek hasilnya
cat("Resampled electricity data shape:\n")
## Resampled electricity data shape:
print(dim(energy_resampled))
## [1] 18864 2
head(energy_resampled)
# Merge data occupancy dan energy
occupancy_energy <- wifi_resampled %>%
rename(occupancy = mean_client) %>%
inner_join(energy_resampled %>% rename (energy = rate),
by = c("time"="ts_10min"))
head(occupancy_energy)
# time series plot occupancy dan energi
ggplot(occupancy_energy, aes(x=time))+
geom_line(aes(y = occupancy, color= "Occupancy"))+
geom_line(aes(y = energy, color= "Energy"))+
scale_y_continuous(
name = "Occupancy",
sec.axis = sec_axis(~., name = "Energy")
)+
labs(title = "Time Series: Occupancy & Energy Consumption", x = "Time")+
scale_color_manual(values = c("Occupancy" = "blue", "Energy" = "orange"))+ theme_minimal()
#Scatter Plot
ggplot(occupancy_energy, aes(x = occupancy, y= energy))+
geom_point(alpha=0.4, color = "blue")+
geom_smooth(method="lm", color="salmon")+
labs(title= "Scatter Plot : Occupancy vs Energy",
x = "Occupancy (clients)",
y = "Energy Consumption (rate)")+
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Daily Profiles (24H)
occupancy_energy <- occupancy_energy %>%
mutate(hour = hour(time),
date = as.Date(time))
# Daily Profiles: Satu garis per hari sesuai perintah
ggplot(occupancy_energy, aes(x = hour, y = occupancy, group = date))+
geom_line(alpha = 0.3, color = "darkgreen")+
stat_summary(aes(group=1), fun = mean, geom = "line", color = "purple", size = 1.2)+
labs(title = "Daily Occupancy Profiles with Average (rata)",
x = "Hour of Day",
y = "Occupancy") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Mengidentifikasi Peak Hours
occupancy_energy %>%
group_by(hour) %>%
summarise(avg_occupancy = mean(occupancy),
avg_energy = mean(energy)) %>%
arrange(desc(avg_occupancy))
# Mengetahui apakah occupancy secara signifikan berpengaruh terhadap konsumsi listrik (energi)
cor(occupancy_energy$occupancy, occupancy_energy$energy)
## [1] 0.8782117
# Highlight 10 kasus energy tidak sesuai occupancy
top_residuals <- occupancy_energy %>%
# hitung residual langsung dari model linear sementara
mutate(residual = energy - predict(lm(energy ~ occupancy, data = .))) %>%
arrange(desc(abs(residual))) %>%
slice_head(n = 10) # 10 kasus ekstrem di data
top_residuals
# Weekends Vs Weekday Comparison
# Memisahkan dataset weekdays (senin-jumat) dan weekend (sabtu-minggu)
occupancy_energy <- occupancy_energy %>%
mutate(day_type = ifelse(wday(date) %in% c(1,7), "Weekend", "Weekday"))
occupancy_energy
# Occupancy Time Series antara weekday dan weekend
ggplot(occupancy_energy, aes(x = time, y = occupancy, color = day_type)) +
geom_line(alpha = 0.6) +
labs(title = "Occupancy Over Time: Weekday vs Weekend",
x = "Time", y = "Occupancy") +
theme_minimal()
# Energy Time Series antara weekday dengan weekend
ggplot(occupancy_energy, aes(x = time, y = energy, color = day_type)) +
geom_line(alpha = 0.6) +
labs(title = "Energy Consumption Over Time: Weekday vs Weekend",
x = "Time", y = "Energy") +
theme_minimal()
#Scatter plot (occupancy vs energy)
ggplot(occupancy_energy, aes(x = occupancy, y = energy, color = day_type)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, size = 1) +
labs(title = "Occupancy vs Energy: Weekday vs Weekend",
x = "Occupancy", y = "Energy") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Daily profile per day (24h)
ggplot(occupancy_energy, aes(x = hour, y = occupancy, group = date, color = day_type)) +
geom_line(alpha = 0.3) +
stat_summary(aes(group=day_type), fun = mean, geom = "line", size = 1.2) +
labs(title = "Daily Occupancy Profiles: Weekday vs Weekend",
x = "Hour of Day",
y = "Occupancy") +
theme_minimal()