Load dataset
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
wifi <- read_csv("D:/wifi.csv")
## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Event Time, Uni, Building, Floor
## dbl (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib1 <- read_csv("D:/library1.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib2 <- read_csv("D:/library2.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lib3 <- read_csv("D:/library3.csv")
## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, units
## dbl (3): reading, cumulative, rate
## dttm (1): ts
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Cek Missing Value dari dataset library
sum(is.na(ymd_hms(lib1$ts)))
## Warning: 131 failed to parse.
## [1] 131
sum(is.na(ymd_hms(lib2$ts)))
## Warning: 131 failed to parse.
## [1] 131
sum(is.na(ymd_hms(lib3$ts)))
## Warning: 131 failed to parse.
## [1] 131
Integrasi variabel ts
library(lubridate)
wifi <- wifi %>%
mutate(time = parse_date_time(time, orders = c('ymd HMS', 'ymd HM', 'ymd')))
lib1 <- lib1 %>%
mutate(ts = parse_date_time(ts, orders = c("ymd HMS", "ymd HM", "ymd")))
lib2 <- lib2 %>%
mutate(ts = parse_date_time(ts, orders = c("ymd HMS", "ymd HM", "ymd")))
lib3 <- lib3 %>%
mutate(ts = parse_date_time(ts, orders = c('ymd HMS', 'ymd HM', 'ymd')))
#------Cek Nilai ts yang missing------
sum(is.na(lib1$ts))
## [1] 0
sum(is.na(lib2$ts))
## [1] 0
sum(is.na(lib3$ts))
## [1] 0
sum(is.na(wifi$time))
## [1] 0
Cek Persentase Missing Value
#--------Plot data-------
library(ggplot2)
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
##
## na.locf
library(naniar)
missing1 <- (sum(is.na(lib1$rate)) / nrow(lib1)) *100
missing1
## [1] 16.15246
missing2 <- (sum(is.na(lib2$rate)) / nrow(lib2)) *100
missing2
## [1] 16.15246
missing3 <- (sum(is.na(lib3$rate)) / nrow(lib3)) *100
missing3
## [1] 16.15246
miss_var_summary(wifi)
Imputasi dengan mean 144 obs pertama
fill_with_mean144 <- function(x) {
mean144 <- mean(head(na.omit(x), 144))
x[is.na(x)] <- mean144
return(x)
}
lib1 <- lib1 %>% mutate(rate = fill_with_mean144(rate))
lib2 <- lib2 %>% mutate(rate = fill_with_mean144(rate))
lib3 <- lib3 %>% mutate(rate = fill_with_mean144(rate))
Hapus Duplikasi Data
wifi <- distinct(wifi)
lib1 <- distinct(lib1)
lib2 <- distinct(lib2)
lib3 <- distinct(lib3)
Filter & Sinkronisasi
Resample WiFi ke 10 menit (mean)
# Filter hanya WiFi di Library
wifi_lib <- wifi %>%
filter(Building == "Library")
# Resample ke 10 menit
wifi_10min <- wifi_lib %>%
mutate(time = floor_date(time, "10 minutes")) %>%
group_by(time) %>%
summarise(occupancy = mean(`Associated Client Count`, na.rm = TRUE))
Resample energi (sum per 10 menit)
lib1_10min <- lib1 %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(rate1 = sum(rate, na.rm = TRUE))
lib2_10min <- lib2 %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(rate2 = sum(rate, na.rm = TRUE))
lib3_10min <- lib3 %>%
mutate(ts = floor_date(ts, "10 minutes")) %>%
group_by(ts) %>%
summarise(rate3 = sum(rate, na.rm = TRUE))
Gabung semua rate jadi total
energy_10min <- lib1_10min %>%
inner_join(lib2_10min, by = "ts") %>%
inner_join(lib3_10min, by = "ts") %>%
mutate(total_energy = rate1 + rate2 + rate3)
Gabung dataset wifi dan Rate
data_all <- wifi_10min %>%
inner_join(energy_10min, by = c("time" = "ts"))
Visualisasi
# Tambah variabel hour dan day_type
data_all <- data_all %>%
mutate(hour = hour(time),
day_type = ifelse(wday(time) %in% c(1,7), "Weekend", "Weekday"))
# 1. Time series Occupancy vs Energy
ggplot(data_all, aes(x = time)) +
geom_line(aes(y = occupancy, color = "Occupancy")) +
geom_line(aes(y = total_energy, color = "Energy")) +
labs(title = "Time Series Occupancy vs Energy",
y = "Value", color = "Legend")

# 2. Scatter plot
ggplot(data_all, aes(x = occupancy, y = total_energy)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red") +
labs(title = "Scatter Plot: Occupancy vs Energy",
x = "Occupancy", y = "Energy")
## `geom_smooth()` using formula = 'y ~ x'

# 3. Daily profile (average per hour per day)
data_daily <- data_all %>%
mutate(date = as.Date(time)) %>%
group_by(date, hour) %>%
summarise(occ = mean(occupancy), energy = mean(total_energy), .groups="drop")
ggplot(data_daily, aes(x = hour, y = occ, group = date)) +
geom_line(alpha = 0.3) +
stat_summary(fun = mean, geom = "line", color = "red", size = 1.2) +
labs(title = "Daily Occupancy Profile (red = average)",
x = "Hour of Day", y = "Occupancy")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 4. Peak hours of occupancy (average per hour)
peak_occ <- data_all %>%
group_by(hour) %>%
summarise(mean_occ = mean(occupancy, na.rm = TRUE))
ggplot(peak_occ, aes(x = hour, y = mean_occ)) +
geom_line(color = "blue") +
geom_point() +
labs(title = "Average Occupancy per Hour (All Days)",
x = "Hour of Day", y = "Mean Occupancy")

# 5. Korelasi numerik
cor_test <- cor.test(data_all$occupancy, data_all$total_energy, use = "complete.obs")
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: data_all$occupancy and data_all$total_energy
## t = 111.41, df = 3681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8706080 0.8853958
## sample estimates:
## cor
## 0.8782116
# 6. Highlight anomaly cases
anomalies <- data_all %>%
filter((occupancy < 10 & total_energy > quantile(total_energy, 0.9, na.rm=TRUE)) |
(occupancy > quantile(occupancy, 0.9, na.rm=TRUE) & total_energy < quantile(total_energy, 0.1, na.rm=TRUE)))
ggplot(data_all, aes(x = occupancy, y = total_energy)) +
geom_point(alpha = 0.3) +
geom_point(data = anomalies, aes(x = occupancy, y = total_energy), color = "red", size = 2) +
labs(title = "Anomalies: Energy vs Occupancy")

# 7. Weekend vs Weekday Comparison
## a. Peak Occupancy
peak_by_type <- data_all %>%
group_by(day_type, hour) %>%
summarise(mean_occ = mean(occupancy, na.rm = TRUE), .groups = "drop")
ggplot(peak_by_type, aes(x = hour, y = mean_occ, color = day_type)) +
geom_line(size = 1) +
geom_point() +
labs(title = "Average Occupancy per Hour: Weekday vs Weekend",
x = "Hour", y = "Mean Occupancy")

## b. Occupancy vs Energy (Weekday vs Weekend)
ggplot(data_all, aes(x = occupancy, y = total_energy, color = day_type)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm") +
labs(title = "Occupancy vs Energy: Weekday vs Weekend")
## `geom_smooth()` using formula = 'y ~ x'

## c. Korelasi per kategori
cor_by_type <- data_all %>%
group_by(day_type) %>%
summarise(cor_val = cor(occupancy, total_energy, use="complete.obs"))
print(cor_by_type)
## # A tibble: 2 × 2
## day_type cor_val
## <chr> <dbl>
## 1 Weekday 0.885
## 2 Weekend 0.902