Try DMV K

Load dataset

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(readr)
library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

wifi <- read_csv("D:/wifi.csv")

## Rows: 1883844 Columns: 7

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib1 <- read_csv("D:/library1.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib2 <- read_csv("D:/library2.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib3 <- read_csv("D:/library3.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Cek Missing Value dari dataset library

sum(is.na(ymd_hms(lib1$ts)))

## Warning: 131 failed to parse.

## [1] 131

sum(is.na(ymd_hms(lib2$ts)))

## Warning: 131 failed to parse.

## [1] 131

sum(is.na(ymd_hms(lib3$ts)))

## Warning: 131 failed to parse.

## [1] 131

Integrasi variabel ts

library(lubridate)

wifi <- wifi %>% 
  mutate(time = parse_date_time(time, orders = c('ymd HMS', 'ymd HM', 'ymd')))

lib1 <- lib1 %>% 
  mutate(ts = parse_date_time(ts, orders = c("ymd HMS", "ymd HM", "ymd")))

lib2 <- lib2 %>% 
  mutate(ts = parse_date_time(ts, orders = c("ymd HMS", "ymd HM", "ymd")))

lib3 <- lib3 %>% 
  mutate(ts = parse_date_time(ts, orders = c('ymd HMS', 'ymd HM', 'ymd')))


#------Cek Nilai ts yang missing------
sum(is.na(lib1$ts))

## [1] 0

sum(is.na(lib2$ts))

## [1] 0

sum(is.na(lib3$ts))

## [1] 0

sum(is.na(wifi$time))

## [1] 0

Cek Persentase Missing Value

#--------Plot data-------
library(ggplot2)
library(imputeTS)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## 
## Attaching package: 'imputeTS'

## The following object is masked from 'package:zoo':
## 
##     na.locf

library(naniar)

missing1 <- (sum(is.na(lib1$rate)) / nrow(lib1)) *100
missing1

## [1] 16.15246

missing2 <- (sum(is.na(lib2$rate)) / nrow(lib2)) *100
missing2

## [1] 16.15246

missing3 <- (sum(is.na(lib3$rate)) / nrow(lib3)) *100
missing3

## [1] 16.15246

miss_var_summary(wifi)

Imputasi dengan mean 144 obs pertama

fill_with_mean144 <- function(x) {
  mean144 <- mean(head(na.omit(x), 144))
  x[is.na(x)] <- mean144
  return(x)
}

lib1 <- lib1 %>% mutate(rate = fill_with_mean144(rate))
lib2 <- lib2 %>% mutate(rate = fill_with_mean144(rate))
lib3 <- lib3 %>% mutate(rate = fill_with_mean144(rate))

Hapus Duplikasi Data

wifi <- distinct(wifi)
lib1 <- distinct(lib1)
lib2 <- distinct(lib2)
lib3 <- distinct(lib3)

Filter & Sinkronisasi

Resample WiFi ke 10 menit (mean)

# Filter hanya WiFi di Library
wifi_lib <- wifi %>% 
  filter(Building == "Library")

# Resample ke 10 menit

wifi_10min <- wifi_lib %>%
  mutate(time = floor_date(time, "10 minutes")) %>%
  group_by(time) %>%
  summarise(occupancy = mean(`Associated Client Count`, na.rm = TRUE))

Resample energi (sum per 10 menit)

lib1_10min <- lib1 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(rate1 = sum(rate, na.rm = TRUE))

lib2_10min <- lib2 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(rate2 = sum(rate, na.rm = TRUE))

lib3_10min <- lib3 %>%
  mutate(ts = floor_date(ts, "10 minutes")) %>%
  group_by(ts) %>%
  summarise(rate3 = sum(rate, na.rm = TRUE))

Gabung semua rate jadi total

energy_10min <- lib1_10min %>%
  inner_join(lib2_10min, by = "ts") %>%
  inner_join(lib3_10min, by = "ts") %>%
  mutate(total_energy = rate1 + rate2 + rate3)

Gabung dataset wifi dan Rate

data_all <- wifi_10min %>%
  inner_join(energy_10min, by = c("time" = "ts"))

Visualisasi

# Tambah variabel hour dan day_type
data_all <- data_all %>%
  mutate(hour = hour(time),
         day_type = ifelse(wday(time) %in% c(1,7), "Weekend", "Weekday"))

# 1. Time series Occupancy vs Energy
ggplot(data_all, aes(x = time)) +
  geom_line(aes(y = occupancy, color = "Occupancy")) +
  geom_line(aes(y = total_energy, color = "Energy")) +
  labs(title = "Time Series Occupancy vs Energy",
       y = "Value", color = "Legend")

# 2. Scatter plot
ggplot(data_all, aes(x = occupancy, y = total_energy)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Scatter Plot: Occupancy vs Energy",
       x = "Occupancy", y = "Energy")

## `geom_smooth()` using formula = 'y ~ x'

# 3. Daily profile (average per hour per day)
data_daily <- data_all %>%
  mutate(date = as.Date(time)) %>%
  group_by(date, hour) %>%
  summarise(occ = mean(occupancy), energy = mean(total_energy), .groups="drop")

ggplot(data_daily, aes(x = hour, y = occ, group = date)) +
  geom_line(alpha = 0.3) +
  stat_summary(fun = mean, geom = "line", color = "red", size = 1.2) +
  labs(title = "Daily Occupancy Profile (red = average)",
       x = "Hour of Day", y = "Occupancy")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 4. Peak hours of occupancy (average per hour)
peak_occ <- data_all %>%
  group_by(hour) %>%
  summarise(mean_occ = mean(occupancy, na.rm = TRUE))

ggplot(peak_occ, aes(x = hour, y = mean_occ)) +
  geom_line(color = "blue") +
  geom_point() +
  labs(title = "Average Occupancy per Hour (All Days)",
       x = "Hour of Day", y = "Mean Occupancy")

# 5. Korelasi numerik
cor_test <- cor.test(data_all$occupancy, data_all$total_energy, use = "complete.obs")
print(cor_test)

## 
##  Pearson's product-moment correlation
## 
## data:  data_all$occupancy and data_all$total_energy
## t = 111.41, df = 3681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8706080 0.8853958
## sample estimates:
##       cor 
## 0.8782116

# 6. Highlight anomaly cases
anomalies <- data_all %>%
  filter((occupancy < 10 & total_energy > quantile(total_energy, 0.9, na.rm=TRUE)) |
         (occupancy > quantile(occupancy, 0.9, na.rm=TRUE) & total_energy < quantile(total_energy, 0.1, na.rm=TRUE)))

ggplot(data_all, aes(x = occupancy, y = total_energy)) +
  geom_point(alpha = 0.3) +
  geom_point(data = anomalies, aes(x = occupancy, y = total_energy), color = "red", size = 2) +
  labs(title = "Anomalies: Energy vs Occupancy")

# 7. Weekend vs Weekday Comparison
## a. Peak Occupancy
peak_by_type <- data_all %>%
  group_by(day_type, hour) %>%
  summarise(mean_occ = mean(occupancy, na.rm = TRUE), .groups = "drop")

ggplot(peak_by_type, aes(x = hour, y = mean_occ, color = day_type)) +
  geom_line(size = 1) +
  geom_point() +
  labs(title = "Average Occupancy per Hour: Weekday vs Weekend",
       x = "Hour", y = "Mean Occupancy")

## b. Occupancy vs Energy (Weekday vs Weekend)
ggplot(data_all, aes(x = occupancy, y = total_energy, color = day_type)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm") +
  labs(title = "Occupancy vs Energy: Weekday vs Weekend")

## `geom_smooth()` using formula = 'y ~ x'

## c. Korelasi per kategori
cor_by_type <- data_all %>%
  group_by(day_type) %>%
  summarise(cor_val = cor(occupancy, total_energy, use="complete.obs"))
print(cor_by_type)

## # A tibble: 2 × 2
##   day_type cor_val
##   <chr>      <dbl>
## 1 Weekday    0.885
## 2 Weekend    0.902