Import Library
Load Dataset
wifi <- read.csv("D:/Semester 5/Data Mining & Visualization/file/wifi.csv", header = TRUE)
library1 <- read.csv("D:/Semester 5/Data Mining & Visualization/file/library1.csv", header = TRUE)
library2 <- read.csv("D:/Semester 5/Data Mining & Visualization/file/library2.csv", header = TRUE)
library3 <- read.csv("D:/Semester 5/Data Mining & Visualization/file/library3.csv", header = TRUE)
Exploratory Data
head(wifi)
## time Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 16
## Authenticated.Client.Count Uni Building
## 1 182 Lancaster University Graduate College
## 2 6 Lancaster University Management School
## 3 18 Lancaster University SW hse 32-33
## 4 23 Lancaster University SW hse 29
## 5 45 Lancaster University Furness outer
## 6 16 Lancaster University Slaidburn House (LUSU)
## Floor
## 1 A Floor
## 2 C Floor
## 3 D Floor
## 4 B floor
## 5 C floor
## 6 E floor
head(library1)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8
head(library2)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016 KWh 2129016 NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034 KWh 2129034 18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054 KWh 2129054 20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071 KWh 2129071 17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086 KWh 2129086 15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103 KWh 2129103 17
head(library3)
## ts name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209 KWh 6914209 NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266 KWh 6914266 57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310 KWh 6914310 44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376 KWh 6914376 66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439 KWh 6914439 63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474 KWh 6914474 35
print(dim(wifi))
## [1] 1883844 7
print(dim(library1))
## [1] 18864 6
print(dim(library2))
## [1] 18864 6
print(dim(library3))
## [1] 18864 6
str(wifi)
## 'data.frame': 1883844 obs. of 7 variables:
## $ time : chr "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Event.Time : chr "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
## $ Associated.Client.Count : int 184 6 18 23 45 16 32 6 73 5 ...
## $ Authenticated.Client.Count: int 182 6 18 23 45 16 32 6 73 5 ...
## $ Uni : chr "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
## $ Building : chr " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
## $ Floor : chr " A Floor" " C Floor" " D Floor" " B floor" ...
str(library1)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : chr "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
## $ name : chr "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
## $ reading : num 1489442 1489449 1489456 1489464 1489471 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 1489442 1489449 1489456 1489464 1489471 ...
## $ rate : num NA 7 7 8 7 8 7 8 7 8 ...
str(library2)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : chr "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
## $ name : chr "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
## $ reading : num 2129016 2129034 2129054 2129071 2129086 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 2129016 2129034 2129054 2129071 2129086 ...
## $ rate : num NA 18 20 17 15 17 17 17 17 17 ...
str(library3)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : chr "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
## $ name : chr "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
## $ reading : num 6914209 6914266 6914310 6914376 6914439 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 6914209 6914266 6914310 6914376 6914439 ...
## $ rate : num NA 57 44 66 63 35 47 55 34 41 ...
wifi$time <- ymd_hms(wifi$time)
library1$ts <- ymd_hms(library1$ts)
library2$ts <- ymd_hms(library2$ts)
library3$ts <- ymd_hms(library3$ts)
str(wifi)
## 'data.frame': 1883844 obs. of 7 variables:
## $ time : POSIXct, format: "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
## $ Event.Time : chr "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
## $ Associated.Client.Count : int 184 6 18 23 45 16 32 6 73 5 ...
## $ Authenticated.Client.Count: int 182 6 18 23 45 16 32 6 73 5 ...
## $ Uni : chr "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
## $ Building : chr " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
## $ Floor : chr " A Floor" " C Floor" " D Floor" " B floor" ...
str(library1)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : POSIXct, format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
## $ name : chr "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
## $ reading : num 1489442 1489449 1489456 1489464 1489471 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 1489442 1489449 1489456 1489464 1489471 ...
## $ rate : num NA 7 7 8 7 8 7 8 7 8 ...
str(library2)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : POSIXct, format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
## $ name : chr "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
## $ reading : num 2129016 2129034 2129054 2129071 2129086 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 2129016 2129034 2129054 2129071 2129086 ...
## $ rate : num NA 18 20 17 15 17 17 17 17 17 ...
str(library3)
## 'data.frame': 18864 obs. of 6 variables:
## $ ts : POSIXct, format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" ...
## $ name : chr "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
## $ reading : num 6914209 6914266 6914310 6914376 6914439 ...
## $ units : chr "KWh" "KWh" "KWh" "KWh" ...
## $ cumulative: num 6914209 6914266 6914310 6914376 6914439 ...
## $ rate : num NA 57 44 66 63 35 47 55 34 41 ...
unique(wifi$Building)
## [1] " Graduate College " " Management School "
## [3] " SW hse 32-33 " " SW hse 29 "
## [5] " Furness outer " " Slaidburn House (LUSU) "
## [7] " SW hse 34 " " Bowland hall "
## [9] " Fylde " " SW hse 55-56 "
## [11] " SW hse 40-42 " " Bowland Twr (Old Bowland Annexe) "
## [13] " Pendle " " Faraday and cTAP "
## [15] " Institute for Advanced Studies " " University House "
## [17] " SW hse 53-54 " " Engineering "
## [19] " Field Station " " SW hse 36 "
## [21] " Infolab " " SW hse 21-23 "
## [23] " LICA " " FU Hse 71-74 "
## [25] " Grizedale " " Charles Carter "
## [27] " Furness " " SW hse 43-45 "
## [29] " SW hse 12-16 " " Bowland Main "
## [31] " SW hse 35 " " Human Resources "
## [33] " County " " FY Hse 65-70 "
## [35] " Barker House Farm " " SW hse 27-28 "
## [37] " Bowland Annexe " " John Creed "
## [39] " grize-res " " SW hse 24-26 "
## [41] " SW hse 20 " " Ruskin Library "
## [43] " County South/Cartmel " " Library "
## [45] " Conference Centre " " Postgrad Stats (PSC) "
## [47] " Bowland North " " George Fox "
## [49] " Hse 75 77 " " Bailrigg House "
## [51] " Sports Centre " " SW hse 30-31 "
## [53] " Bowland Ash " " Alex Square "
## [55] " LEC " " MDC "
## [57] " ISS Building " " Chaplaincy Centre "
## [59] " CETAD " " SW hse 50-52 "
## [61] " SW hse 17-19 " " Central Workshops "
## [63] " SW hse 46-49 " " SW hse 39 "
## [65] " Science and technology " " Whewell "
## [67] " SW hse 37-38 " " Lonsdale College (SW) "
## [69] " Physics " " County Field "
## [71] " Great Hall " " SW hse-158-179 "
## [73] " Preschool " " Reception "
## [75] " Hazelrigg " " Energy Centre "
wifi_library <- subset(wifi, Building == " Library " )
head(wifi_library)
## time Event.Time Associated.Client.Count
## 74 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 29
## 158 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 0
## 235 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 21
## 260 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020 38
## 291 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 0
## 497 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020 33
## Authenticated.Client.Count Uni Building Floor
## 74 28 Lancaster University Library A floor
## 158 0 Lancaster University Library LG floor
## 235 20 Lancaster University Library C floor
## 260 37 Lancaster University Library B floor
## 291 0 Lancaster University Library LG floor
## 497 31 Lancaster University Library A floor
colSums(is.na(wifi_library))
## time Event.Time
## 0 0
## Associated.Client.Count Authenticated.Client.Count
## 0 0
## Uni Building
## 0 0
## Floor
## 0
wifi_library_10min <- wifi_library %>%
mutate(time = floor_date(time, "10 minutes")) %>%
group_by(time) %>%
summarise(mean_count = mean(Associated.Client.Count, na.rm = TRUE)) %>%
ungroup()
print(wifi_library_10min)
## # A tibble: 3,683 × 2
## time mean_count
## <dttm> <dbl>
## 1 2020-02-01 00:00:00 22.8
## 2 2020-02-01 00:10:00 20.6
## 3 2020-02-01 00:20:00 17.2
## 4 2020-02-01 00:30:00 15.4
## 5 2020-02-01 00:40:00 12.4
## 6 2020-02-01 00:50:00 10.5
## 7 2020-02-01 01:00:00 9.25
## 8 2020-02-01 01:10:00 8.67
## 9 2020-02-01 01:20:00 8.5
## 10 2020-02-01 01:30:00 8.38
## # ℹ 3,673 more rows
# Data yang duplikat
wifi_library_10min[duplicated(wifi_library_10min$time), ]
## # A tibble: 0 × 2
## # ℹ 2 variables: time <dttm>, mean_count <dbl>
library1[duplicated(library1$ts), ]
## [1] ts name reading units cumulative rate
## <0 rows> (or 0-length row.names)
library2[duplicated(library2$ts), ]
## [1] ts name reading units cumulative rate
## <0 rows> (or 0-length row.names)
library3[duplicated(library3$ts), ]
## [1] ts name reading units cumulative rate
## <0 rows> (or 0-length row.names)
# Cek data kosong (NA) di setiap kolom
colSums(is.na(wifi_library_10min))
## time mean_count
## 0 0
colSums(is.na(library1))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 3047
colSums(is.na(library2))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 3047
colSums(is.na(library3))
## ts name reading units cumulative rate
## 0 0 3041 0 3041 3047
library1$rate[is.na(library1$rate)] <- mean(library1$rate[1:144], na.rm = TRUE)
library2$rate[is.na(library2$rate)] <- mean(library2$rate[1:144], na.rm = TRUE)
library3$rate[is.na(library3$rate)] <- mean(library3$rate[1:144], na.rm = TRUE)
Data Integration
# Data Integration
energy <- merge(library1, library2, by = "ts")
energy <- merge(energy, library3, by = "ts")
head(energy)
## ts name.x reading.x units.x cumulative.x rate.x
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh 1489442 7.384615
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh 1489449 7.000000
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh 1489456 7.000000
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh 1489464 8.000000
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh 1489471 7.000000
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh 1489479 8.000000
## name.y reading.y units.y cumulative.y rate.y name
## 1 MC065-L01/M11R2056 2129016 KWh 2129016 17.87413 MC065-L01/M13R2064
## 2 MC065-L01/M11R2056 2129034 KWh 2129034 18.00000 MC065-L01/M13R2064
## 3 MC065-L01/M11R2056 2129054 KWh 2129054 20.00000 MC065-L01/M13R2064
## 4 MC065-L01/M11R2056 2129071 KWh 2129071 17.00000 MC065-L01/M13R2064
## 5 MC065-L01/M11R2056 2129086 KWh 2129086 15.00000 MC065-L01/M13R2064
## 6 MC065-L01/M11R2056 2129103 KWh 2129103 17.00000 MC065-L01/M13R2064
## reading units cumulative rate
## 1 6914209 KWh 6914209 39.55944
## 2 6914266 KWh 6914266 57.00000
## 3 6914310 KWh 6914310 44.00000
## 4 6914376 KWh 6914376 66.00000
## 5 6914439 KWh 6914439 63.00000
## 6 6914474 KWh 6914474 35.00000
energy$total_rate <- rowSums(energy[, c("rate.x", "rate.y", "rate")], na.rm = TRUE)
energy <- energy[, c("ts", "total_rate")]
energy$ts <- as.POSIXct(energy$ts)
head(energy)
## ts total_rate
## 1 2020-01-01 00:00:00 64.81818
## 2 2020-01-01 00:10:00 82.00000
## 3 2020-01-01 00:20:00 71.00000
## 4 2020-01-01 00:30:00 91.00000
## 5 2020-01-01 00:40:00 85.00000
## 6 2020-01-01 00:50:00 60.00000
ggplot() +
geom_line(data = wifi_library_10min, aes(x = time, y = mean_count, color = "Occupancy (WiFi)"), alpha = 0.7) +
geom_line(data = energy, aes(x = ts, y = total_rate, color = "Energy Consumption (kWh)"), alpha = 0.7) +
labs(title = "Time Series of Library Occupancy vs Energy Consumption",
x = "Time", y = "Count / kWh", color = "") +
theme_minimal() +
theme(legend.position = "bottom")
# Mencari Min Max Occupancy
min_time <- min(wifi_library_10min$time, na.rm = TRUE)
max_time <- max(wifi_library_10min$time, na.rm = TRUE)
#Filter data energy pada rentang min_time - max_time
energy_filtered <- subset(energy, ts >= min_time & ts <= max_time)
head(energy_filtered)
## ts total_rate
## 4465 2020-02-01 00:00:00 64.81818
## 4466 2020-02-01 00:10:00 97.00000
## 4467 2020-02-01 00:20:00 96.00000
## 4468 2020-02-01 00:30:00 100.00000
## 4469 2020-02-01 00:40:00 89.00000
## 4470 2020-02-01 00:50:00 91.00000
#Data Integration antara wifi dan energy
df <- data.frame(
time = wifi_library_10min$time,
occupancy = wifi_library_10min$mean_count,
rate = energy_filtered$total_rate
)
df <- na.omit(df)
head(df)
## time occupancy rate
## 1 2020-02-01 00:00:00 22.750 64.81818
## 2 2020-02-01 00:10:00 20.625 97.00000
## 3 2020-02-01 00:20:00 17.250 96.00000
## 4 2020-02-01 00:30:00 15.375 100.00000
## 5 2020-02-01 00:40:00 12.375 89.00000
## 6 2020-02-01 00:50:00 10.500 91.00000
Data Visualization and Analysis
ggplot(df, aes(x = time)) +
geom_line(aes(y = occupancy, color = "Occupancy (WiFi)"), alpha = 0.7) +
geom_line(aes(y = rate, color = "Energy Consumption (kWh)"), alpha = 0.7) +
labs(title = "Time Series of Library Occupancy vs Energy Consumption",
x = "Time", y = "Count / kWh", color = "") +
theme_minimal() +
theme(legend.position = "bottom")
# Scatterplot antara Occupancy vs Rate
ggplot(df, aes(x = occupancy, y = rate)) +
geom_point(color = "blue", shape = 16, size = 1) +
labs(x = "Occupancy", y = "Rate") +
theme_minimal()
# Filter data hanya yang terdapat pada 1 hari full
lastdate <- max(as.Date(df$time))
df <- subset(df, as.Date(time) < lastdate)
df$time <- as.POSIXct(df$time)
df$hour <- as.numeric(format(df$time, "%H")) + as.numeric(format(df$time, "%M"))/60
df$date <- as.Date(df$time)
df$hour_of_day <- as.POSIXct(df$hour * 3600, origin = "1970-01-01", tz = "UTC")
df$hour_of_day <- lubridate::round_date(df$hour_of_day, "10 minutes")
# rata-rata per jam across days
df_daily <- df %>%
group_by(hour) %>%
summarise(
occupancy = mean(occupancy, na.rm = TRUE),
rate = mean(rate, na.rm = TRUE)
) %>%
ungroup()
df_daily$hour_of_day <- as.POSIXct(df_daily$hour * 3600, origin = "1970-01-01", tz = "UTC")
df_daily$hour_of_day <- lubridate::round_date(df_daily$hour_of_day, "10 minutes")
df_daily
## # A tibble: 144 × 4
## hour occupancy rate hour_of_day
## <dbl> <dbl> <dbl> <dttm>
## 1 0 48.7 128. 1970-01-01 00:00:00
## 2 0.167 44.0 127. 1970-01-01 00:10:00
## 3 0.333 40.4 123. 1970-01-01 00:20:00
## 4 0.5 37.8 124 1970-01-01 00:30:00
## 5 0.667 34.9 124. 1970-01-01 00:40:00
## 6 0.833 31.9 121. 1970-01-01 00:50:00
## 7 1 29.7 121. 1970-01-01 01:00:00
## 8 1.17 27.1 116. 1970-01-01 01:10:00
## 9 1.33 25.9 113. 1970-01-01 01:20:00
## 10 1.5 24.3 111. 1970-01-01 01:30:00
## # ℹ 134 more rows
# Occupancy profile
p1 <- ggplot(df, aes(x = hour_of_day, y = occupancy, group = date)) +
geom_line(color = "blue", alpha = 0.2) +
geom_line(data = df_daily, aes(x = hour_of_day, y = occupancy, group = 1),
color = "red", size = 1) +
scale_x_datetime(date_labels = "%H:%M:%S") +
labs(title = "Daily Profiles of Occupancy (01 Feb 2020 - 25 Feb 2020)",
y = "Occupancy", x = NULL) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Rate profile
p2 <- ggplot(df, aes(x = hour_of_day, y = rate, group = date)) +
geom_line(color = "green", alpha = 0.2) +
geom_line(data = df_daily, aes(x = hour_of_day, y = rate, group = 1),
color = "orange", size = 1) +
scale_x_datetime(date_labels = "%H:%M:%S") +
labs(title = "Daily Profiles of Rate (01 Feb 2020 - 25 Feb 2020)",
y = "Rate", x = "Hour of Day") +
theme_minimal()
# Gabung dua plot
p1 / p2
# Separate data Weekdays dan Weekend
df$day_of_week <- as.integer(format(df$time, "%w")) # 0 = Minggu, 6 = Sabtu
df$weekend <- df$day_of_week >= 6 | df$day_of_week == 0
head(df)
## time occupancy rate hour date
## 1 2020-02-01 00:00:00 22.750 64.81818 0.0000000 2020-02-01
## 2 2020-02-01 00:10:00 20.625 97.00000 0.1666667 2020-02-01
## 3 2020-02-01 00:20:00 17.250 96.00000 0.3333333 2020-02-01
## 4 2020-02-01 00:30:00 15.375 100.00000 0.5000000 2020-02-01
## 5 2020-02-01 00:40:00 12.375 89.00000 0.6666667 2020-02-01
## 6 2020-02-01 00:50:00 10.500 91.00000 0.8333333 2020-02-01
## hour_of_day day_of_week weekend
## 1 1970-01-01 00:00:00 6 TRUE
## 2 1970-01-01 00:10:00 6 TRUE
## 3 1970-01-01 00:20:00 6 TRUE
## 4 1970-01-01 00:30:00 6 TRUE
## 5 1970-01-01 00:40:00 6 TRUE
## 6 1970-01-01 00:50:00 6 TRUE
# Rata-rata weekday
df_weekday <- df %>%
filter(!weekend) %>%
group_by(hour) %>%
summarise(
occupancy = mean(occupancy, na.rm = TRUE),
rate = mean(rate, na.rm = TRUE)
) %>%
ungroup()
df_weekday$hour_of_day <- as.POSIXct(df_weekday$hour * 3600, origin = "1970-01-01", tz = "UTC")
df_weekday$hour_of_day <- lubridate::round_date(df_weekday$hour_of_day, "10 minutes")
# Rata-rata weekend
df_weekend <- df %>%
filter(weekend) %>%
group_by(hour) %>%
summarise(
occupancy = mean(occupancy, na.rm = TRUE),
rate = mean(rate, na.rm = TRUE)
) %>%
ungroup()
df_weekend$hour_of_day <- as.POSIXct(df_weekend$hour * 3600, origin = "1970-01-01", tz = "UTC")
df_weekend$hour_of_day <- lubridate::round_date(df_weekend$hour_of_day, "10 minutes")
head(df_weekend)
## # A tibble: 6 × 4
## hour occupancy rate hour_of_day
## <dbl> <dbl> <dbl> <dttm>
## 1 0 31.1 109. 1970-01-01 00:00:00
## 2 0.167 28.4 112. 1970-01-01 00:10:00
## 3 0.333 25.4 109. 1970-01-01 00:20:00
## 4 0.5 23.8 111. 1970-01-01 00:30:00
## 5 0.667 21.6 109. 1970-01-01 00:40:00
## 6 0.833 19.7 115. 1970-01-01 00:50:00
head(df_weekday)
## # A tibble: 6 × 4
## hour occupancy rate hour_of_day
## <dbl> <dbl> <dbl> <dttm>
## 1 0 56.9 137. 1970-01-01 00:00:00
## 2 0.167 51.3 134. 1970-01-01 00:10:00
## 3 0.333 47.4 129. 1970-01-01 00:20:00
## 4 0.5 44.3 130. 1970-01-01 00:30:00
## 5 0.667 41.1 131. 1970-01-01 00:40:00
## 6 0.833 37.6 124. 1970-01-01 00:50:00
#Visualisasi Weekday vs Weekend
# Occupancy
p_occ <- ggplot() +
geom_line(data = df_weekday, aes(x = hour_of_day, y = occupancy, color = "Weekday"), size = 1) +
geom_line(data = df_weekend, aes(x = hour_of_day, y = occupancy, color = "Weekend"), size = 1) +
scale_x_datetime(date_labels = "%H:%M:%S") +
scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "red")) +
labs(title = "Weekday vs Weekend - Occupancy", y = "Occupancy", x = NULL, color = NULL) +
theme_minimal()
# Rate
p_rate <- ggplot() +
geom_line(data = df_weekday, aes(x = hour_of_day, y = rate, color = "Weekday"), size = 1) +
geom_line(data = df_weekend, aes(x = hour_of_day, y = rate, color = "Weekend"), size = 1) +
scale_x_datetime(date_labels = "%H:%M:%S") +
scale_color_manual(values = c("Weekday" = "blue", "Weekend" = "red")) +
labs(title = "Weekday vs Weekend - Rate", y = "Rate", x = "Hour of Day", color = NULL) +
theme_minimal()
p_occ / p_rate
# Identity Peak Hour
df$hour <- as.integer(format(df$time, "%H"))
occupancy_by_hour <- df %>%
group_by(hour) %>%
summarise(occupancy = mean(occupancy, na.rm = TRUE)) %>%
ungroup()
peak_hours <- occupancy_by_hour %>%
filter(occupancy == max(occupancy, na.rm = TRUE))
print(peak_hours)
## # A tibble: 1 × 2
## hour occupancy
## <int> <dbl>
## 1 15 370.
ggplot(occupancy_by_hour, aes(x = factor(hour), y = occupancy)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Barplot Occupancy per Hour", x = "Hour", y = "Occupancy") +
theme_minimal() +
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
#Examine whether occupancy significantly influences energy
correlation <- cor(df$occupancy, df$rate, use = "complete.obs")
print(paste("Korelasi Occupancy vs Energy Consumption (Rate) :", correlation))
## [1] "Korelasi Occupancy vs Energy Consumption (Rate) : 0.877768630777599"
#Highlight cases where energy consumption does not align with occupancy.
df <- df %>%
mutate(
z_rate = scale(rate),
z_occupancy = scale(occupancy),
anomaly = (abs(z_rate) > 2) & (abs(z_occupancy) < 2)
)
#Visualisasi scatterplot dan mewarnai berbeda yang anomali
ggplot() +
geom_point(data = subset(df, !anomaly), aes(x = occupancy, y = rate), color = "blue", alpha = 0.7) +
geom_point(data = subset(df, anomaly), aes(x = occupancy, y = rate), color = "red", alpha = 0.7) +
labs(title = "Occupancy vs Rate (Anomaly Highlight)",
x = "Occupancy", y = "Energy Rate") +
theme_minimal()