library(dplyr)
library(lubridate)
library(stringr)
library(zoo)
library(ggplot2)

Data Integration

Extract

# load data
df1 <- read.csv("library1.csv")
df2 <- read.csv("library2.csv")
df3 <- read.csv("library3.csv")
df4 <- read.csv("wifi.csv")

# Menunjukkan 6 data pertama
head(df1)

##                    ts              name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442   KWh    1489442   NA
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449   KWh    1489449    7
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456   KWh    1489456    7
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464   KWh    1489464    8
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471   KWh    1489471    7
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479   KWh    1489479    8

head(df2)

##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M11R2056 2129016   KWh    2129016   NA
## 2 2020-01-01 00:10:00 MC065-L01/M11R2056 2129034   KWh    2129034   18
## 3 2020-01-01 00:20:00 MC065-L01/M11R2056 2129054   KWh    2129054   20
## 4 2020-01-01 00:30:00 MC065-L01/M11R2056 2129071   KWh    2129071   17
## 5 2020-01-01 00:40:00 MC065-L01/M11R2056 2129086   KWh    2129086   15
## 6 2020-01-01 00:50:00 MC065-L01/M11R2056 2129103   KWh    2129103   17

head(df3)

##                    ts               name reading units cumulative rate
## 1 2020-01-01 00:00:00 MC065-L01/M13R2064 6914209   KWh    6914209   NA
## 2 2020-01-01 00:10:00 MC065-L01/M13R2064 6914266   KWh    6914266   57
## 3 2020-01-01 00:20:00 MC065-L01/M13R2064 6914310   KWh    6914310   44
## 4 2020-01-01 00:30:00 MC065-L01/M13R2064 6914376   KWh    6914376   66
## 5 2020-01-01 00:40:00 MC065-L01/M13R2064 6914439   KWh    6914439   63
## 6 2020-01-01 00:50:00 MC065-L01/M13R2064 6914474   KWh    6914474   35

head(df4)

##                  time                   Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                     184
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       6
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      18
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      23
## 5 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      45
## 6 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      16
##   Authenticated.Client.Count                   Uni                 Building
## 1                        182 Lancaster University         Graduate College 
## 2                          6 Lancaster University        Management School 
## 3                         18 Lancaster University             SW hse 32-33 
## 4                         23 Lancaster University                SW hse 29 
## 5                         45 Lancaster University            Furness outer 
## 6                         16 Lancaster University   Slaidburn House (LUSU) 
##      Floor
## 1  A Floor
## 2  C Floor
## 3  D Floor
## 4  B floor
## 5  C floor
## 6  E floor

# Melihat struktur dari keempat dataset
str(df1)

## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : chr  "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
##  $ name      : chr  "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" "MC065-L01/M9R2048" ...
##  $ reading   : num  1489442 1489449 1489456 1489464 1489471 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  1489442 1489449 1489456 1489464 1489471 ...
##  $ rate      : num  NA 7 7 8 7 8 7 8 7 8 ...

str(df2)

## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : chr  "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
##  $ name      : chr  "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" "MC065-L01/M11R2056" ...
##  $ reading   : num  2129016 2129034 2129054 2129071 2129086 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  2129016 2129034 2129054 2129071 2129086 ...
##  $ rate      : num  NA 18 20 17 15 17 17 17 17 17 ...

str(df3)

## 'data.frame':    18864 obs. of  6 variables:
##  $ ts        : chr  "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" "2020-01-01 00:30:00" ...
##  $ name      : chr  "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" "MC065-L01/M13R2064" ...
##  $ reading   : num  6914209 6914266 6914310 6914376 6914439 ...
##  $ units     : chr  "KWh" "KWh" "KWh" "KWh" ...
##  $ cumulative: num  6914209 6914266 6914310 6914376 6914439 ...
##  $ rate      : num  NA 57 44 66 63 35 47 55 34 41 ...

str(df4)

## 'data.frame':    1883844 obs. of  7 variables:
##  $ time                      : chr  "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" "2020-02-01 00:02:12" ...
##  $ Event.Time                : chr  "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" "Sat Feb 01 00:02:12 UTC 2020" ...
##  $ Associated.Client.Count   : int  184 6 18 23 45 16 32 6 73 5 ...
##  $ Authenticated.Client.Count: int  182 6 18 23 45 16 32 6 73 5 ...
##  $ Uni                       : chr  "Lancaster University " "Lancaster University " "Lancaster University " "Lancaster University " ...
##  $ Building                  : chr  " Graduate College " " Management School " " SW hse 32-33 " " SW hse 29 " ...
##  $ Floor                     : chr  " A Floor" " C Floor" " D Floor" " B floor" ...

Transform

Filtering & Synchronizing The Data

# Memastikan kolom ts/time menjadi datetime lalu mengurutkan ts/time
df1 <- df1 %>%
  mutate(ts = ymd_hms(ts)) %>%  
  arrange(ts)

df2 <- df2 %>%
  mutate(ts = ymd_hms(ts)) %>%
  arrange(ts)

df3 <- df3 %>%
  mutate(ts = ymd_hms(ts)) %>%
  arrange(ts)

df4 <- df4 %>%
  mutate(time = ymd_hms(time)) %>%
  arrange(time)

# Filter untuk menggunakan data 'Library' saja pada kolom 'Building'
df4 <- df4 %>%
  filter(str_trim(Building) == "Library")  # str_trim untuk hapus spasi berlebih pada " Library "
head(df4)

##                  time                   Event.Time Associated.Client.Count
## 1 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      29
## 2 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                       0
## 3 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      21
## 4 2020-02-01 00:02:12 Sat Feb 01 00:02:12 UTC 2020                      38
## 5 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                       0
## 6 2020-02-01 00:07:34 Sat Feb 01 00:07:34 UTC 2020                      33
##   Authenticated.Client.Count                   Uni  Building     Floor
## 1                         28 Lancaster University   Library    A floor
## 2                          0 Lancaster University   Library   LG floor
## 3                         20 Lancaster University   Library    C floor
## 4                         37 Lancaster University   Library    B floor
## 5                          0 Lancaster University   Library   LG floor
## 6                         31 Lancaster University   Library    A floor

# Resample WiFi data (mean dari Associated Client Count) per 10 menit
wifi_resampled <- df4 %>%
  group_by(time = floor_date(time, "10 minutes")) %>% # membulatkan waktu ke interval 10 menit
  summarise(library_occupancy = mean(`Associated.Client.Count`, na.rm = TRUE)) %>% # data merupakan mean dari interval 10 menit
  ungroup()

head(wifi_resampled)

## # A tibble: 6 × 2
##   time                library_occupancy
##   <dttm>                          <dbl>
## 1 2020-02-01 00:00:00              22.8
## 2 2020-02-01 00:10:00              20.6
## 3 2020-02-01 00:20:00              17.2
## 4 2020-02-01 00:30:00              15.4
## 5 2020-02-01 00:40:00              12.4
## 6 2020-02-01 00:50:00              10.5

# Menggabungkan ketiga library meters menjadi satu dataframe
library_energy <- data.frame(
  time   = df1$ts,
  rate_1 = df1$rate,
  rate_2 = df2$rate,
  rate_3 = df3$rate
)
head(library_energy)

##                  time rate_1 rate_2 rate_3
## 1 2020-01-01 00:00:00     NA     NA     NA
## 2 2020-01-01 00:10:00      7     18     57
## 3 2020-01-01 00:20:00      7     20     44
## 4 2020-01-01 00:30:00      8     17     66
## 5 2020-01-01 00:40:00      7     15     63
## 6 2020-01-01 00:50:00      8     17     35

# Data merging
data <- library_energy %>%
  inner_join(wifi_resampled, by = "time") # key join kolom 'time'

head(data)

##                  time rate_1 rate_2 rate_3 library_occupancy
## 1 2020-02-01 00:00:00     NA     NA     NA            22.750
## 2 2020-02-01 00:10:00     13     11     73            20.625
## 3 2020-02-01 00:20:00     13     11     72            17.250
## 4 2020-02-01 00:30:00     11     15     74            15.375
## 5 2020-02-01 00:40:00     11     13     65            12.375
## 6 2020-02-01 00:50:00     11     16     64            10.500

Missing Values & Energy Consumption Aggregation

colSums(is.na(data))

##              time            rate_1            rate_2            rate_3 
##                 0                 1                 1                 1 
## library_occupancy 
##                 0

# Imputasi masing-masing rate dengan mean dari 144 observasi pertama
for (col in c("rate_1", "rate_2", "rate_3")) {
  mean_val <- mean(data[[col]][1:144], na.rm = TRUE)
  data[[col]][is.na(data[[col]])] <- mean_val
}

head(data)

##                  time   rate_1   rate_2   rate_3 library_occupancy
## 1 2020-02-01 00:00:00 15.83916 24.34965 82.60839            22.750
## 2 2020-02-01 00:10:00 13.00000 11.00000 73.00000            20.625
## 3 2020-02-01 00:20:00 13.00000 11.00000 72.00000            17.250
## 4 2020-02-01 00:30:00 11.00000 15.00000 74.00000            15.375
## 5 2020-02-01 00:40:00 11.00000 13.00000 65.00000            12.375
## 6 2020-02-01 00:50:00 11.00000 16.00000 64.00000            10.500

# Agregasi ketiga library meter
data <- data %>%
  mutate(total_rate = rate_1 + rate_2 + rate_3)

head(data)

##                  time   rate_1   rate_2   rate_3 library_occupancy total_rate
## 1 2020-02-01 00:00:00 15.83916 24.34965 82.60839            22.750   122.7972
## 2 2020-02-01 00:10:00 13.00000 11.00000 73.00000            20.625    97.0000
## 3 2020-02-01 00:20:00 13.00000 11.00000 72.00000            17.250    96.0000
## 4 2020-02-01 00:30:00 11.00000 15.00000 74.00000            15.375   100.0000
## 5 2020-02-01 00:40:00 11.00000 13.00000 65.00000            12.375    89.0000
## 6 2020-02-01 00:50:00 11.00000 16.00000 64.00000            10.500    91.0000

Duplicate

sum(duplicated(data))

## [1] 0

Outlier

# Deteksi outlier menggunakan IQR
find_outliers <- function(vec) {
  q1 <- quantile(vec, 0.25, na.rm = TRUE)
  q3 <- quantile(vec, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  cut_off <- 1.5 * iqr
  minimum <- q1 - cut_off
  maximum <- q3 + cut_off
  outliers <- vec[vec < minimum | vec > maximum]
  return(outliers)
}

find_outliers(data$total_rate)

## numeric(0)

find_outliers(data$library_occupancy)

## numeric(0)

Load

# Simpan ke CSV
write.csv(data, "Clean Data.csv", row.names = FALSE)

Visualisasi

#stacked time series
library(ggplot2)
ggplot() +
  geom_area(data = data, aes(x = time, y = library_occupancy),
            fill = "orange", alpha = 0.8) +
  geom_area(data = data, aes(x = time, y = total_rate),
            fill = "blue", alpha = 0.8) +
  labs(
    title = "Stacked Time Series: Energy Consumption vs Library Occupancy",
    x = "Time",
    y = "Value"
  ) +
 theme_minimal()

#Scatter Plot: occupancy (X) vs energy consumption 
ggplot(data, aes(x = library_occupancy, y = total_rate)) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(
    title = "Scatter Plot: Occupancy vs Energy Consumption",
    x = "Library Occupancy",
    y = "Energy Consumption"
  ) +
 theme_minimal()

#daily profiles energy consumption
data %>%
  mutate(
    hour = as.POSIXct(format(time, "%H:%M"), format = "%H:%M")
  ) %>%
  ggplot(aes(x = hour, y = total_rate, group = as.Date(time))) +
  geom_line(color = "gray", alpha = 0.4) +                
  stat_summary(fun = mean, geom = "line", color = "blue", 
               aes(group = 1), size = 1.2) +
  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hours") + #biar format hh:mm per 2h
  labs(
    title = "Daily Profiles: Energy Consumption",
    x = "Hour of Day",
    y = "Mean Energy Consumption"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  #biar label gk numpuk

#library occupancy daily profiles
data %>%
  mutate(
    hour = as.POSIXct(format(time, "%H:%M"), format = "%H:%M")
  ) %>%
  ggplot(aes(x = hour, y = library_occupancy, group = as.Date(time))) +
  geom_line(color = "gray", alpha = 0.4) +                
  stat_summary(fun = mean, geom = "line", color = "darkgreen", 
               aes(group = 1), size = 1.2) +
  scale_x_datetime(date_labels = "%H:%M", date_breaks = "2 hours") + #biar format hh:mm per 2h
  labs(
    title = "Daily Profiles: Library Occupancy",
    x = "Hour of Day",
    y = "Mean Library Occupancy"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) #biar label gk numpuk

Analisis

# Library GGPlot
library(ggplot2)

# Peak Hours of Occupancy
# Rata-rata occupancy per jam per hari
hourly_occ <- df4 %>%
  group_by(hour = lubridate::floor_date(time, "hour")) %>%
  summarise(avg_occ = mean(Associated.Client.Count, na.rm = TRUE)) %>%
  ungroup()

head(hourly_occ)

## # A tibble: 6 × 2
##   hour                avg_occ
##   <dttm>                <dbl>
## 1 2020-02-01 00:00:00   16.5 
## 2 2020-02-01 01:00:00    8.27
## 3 2020-02-01 02:00:00    7.04
## 4 2020-02-01 03:00:00    7.21
## 5 2020-02-01 04:00:00    7.12
## 6 2020-02-01 05:00:00    6.60

# Rata-rata occupancy per 10 menit per hari
occ_10min <- df4 %>%
  group_by(time = lubridate::floor_date(time, "10 minutes")) %>%
  summarise(avg_occ = mean(Associated.Client.Count, na.rm = TRUE)) %>%
  ungroup()

# Visualisasi pola occupancy per jam
ggplot(hourly_occ, aes(x = hour, y = avg_occ)) +
  geom_line(color = "black") +
  labs(title = "Average Hourly Occupancy in Library",
       x = "Hour of Day",
       y = "Average Associated Client Count") +
  theme_minimal()

# Top 3 peak occupancy hours 
top3 <- occ_10min %>%
  arrange(desc(avg_occ)) %>%
  head(3)
cat("Top 3 peak occupancy hours:\n")

## Top 3 peak occupancy hours:

print(top3)

## # A tibble: 3 × 2
##   time                avg_occ
##   <dttm>                <dbl>
## 1 2020-02-25 15:20:00    512.
## 2 2020-02-25 15:30:00    512.
## 3 2020-02-18 14:40:00    509

# Occupancy influences energy?
# Variabel
occ <- data$library_occupancy
eng <- data$total_rate

# Pearson correlation
cor_test <- cor.test(occ, eng, method = "pearson")
print(cor_test)

## 
##  Pearson's product-moment correlation
## 
## data:  occ and eng
## t = 111.43, df = 3681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8706546 0.8854374
## sample estimates:
##       cor 
## 0.8782556

# OLS Regression (Energy ~ Occupancy)
model <- lm(total_rate ~ library_occupancy, data = data)
summary(model)

## 
## Call:
## lm(formula = total_rate ~ library_occupancy, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.328 -15.549   1.796  16.218  50.933 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.037e+02  4.940e-01   210.0   <2e-16 ***
## library_occupancy 2.668e-01  2.395e-03   111.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.89 on 3681 degrees of freedom
## Multiple R-squared:  0.7713, Adjusted R-squared:  0.7713 
## F-statistic: 1.242e+04 on 1 and 3681 DF,  p-value: < 2.2e-16

# Scatter plot
ggplot(data, aes(x = library_occupancy, y = total_rate)) +
  geom_point(alpha = 0.4, color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Occupancy vs Energy Consumption",
    x = "Library Occupancy (WiFi count)",
    y = "Total Energy Consumption (rate)"
  ) +
 theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Highlight Cases Energy Does Not Align Occupancy
# Data Prediksi
data$predicted_energy <- predict(model, newdata = data)

pred <- data$predicted_energy

# Hitung residuals
data$residual <- eng - pred
residual <- data$residual

# Tentukan threshold anomali
threshold <- 1.5 * sd(residual, na.rm = TRUE)

# Tandai anomalies
anomalies <- data[abs(residual) > threshold, ]

cat("Total anomalies found:", nrow(anomalies), "\n")

## Total anomalies found: 487

# Visualisasi
ggplot(data, aes(x = library_occupancy, y = total_rate)) +
  geom_point(alpha = 0.4, color = "blue") +
  geom_point(data = anomalies, aes(x = library_occupancy, y = total_rate),
             color = "red", size = 3, shape = 21, fill = "red") +
  geom_line(aes(y = predicted_energy), color = "green", size = 1) +
  labs(
    title = "Energy vs Occupancy with Anomalies Highlighted",
    x = "Library Occupancy",
    y = "Total Energy Consumption"
  ) +
 theme_minimal()

Weekday vs Weekend Comparison

data <- data %>%
  mutate(
    date = as.Date(time),
    hour = hour(time),
    dayofweek = wday(time, week_start = 1),
    is_weekend = dayofweek >= 6
  )

ggplot(data, aes(x = library_occupancy, y = total_rate, color = is_weekend)) +
  geom_point(alpha = 0.5) +
  scale_color_manual(
    name = "Kategori Hari", 
    values = c("FALSE" = "blue", "TRUE" = "red"),
    labels = c("Weekday", "Weekend")
  ) +
  labs(
    title = "Occupancy vs Energy Consumption (Weekday vs Weekend)",
    x = "Occupancy",
    y = "Energy Consumption"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

energy_profiles <- data %>%
  group_by(is_weekend, date, hour) %>%
  summarise(total_rate = mean(total_rate, na.rm = TRUE), .groups = 'drop')

energy_mean <- energy_profiles %>%
  group_by(is_weekend, hour) %>%
  summarise(total_rate = mean(total_rate, na.rm = TRUE), .groups = 'drop')
  
occ_profiles <- data %>%
  group_by(is_weekend, date, hour) %>%
  summarise(library_occupancy = mean(library_occupancy, na.rm = TRUE), .groups = 'drop')

occ_mean <- occ_profiles %>%
  group_by(is_weekend, hour) %>%
  summarise(library_occupancy = mean(library_occupancy, na.rm = TRUE), .groups = 'drop')

ggplot() +
  geom_line(data = energy_profiles, 
            aes(x = hour, y = total_rate, group = date, color = is_weekend), 
            alpha = 0.2) +
  geom_line(data = energy_mean, 
            aes(x = hour, y = total_rate, color = is_weekend), 
            linewidth = 1.5) +
  scale_color_manual(
    name = "Kategori Hari", 
    values = c("FALSE" = "blue", "TRUE" = "red"),
    labels = c("Weekday", "Weekend")
  ) +
  labs(
    title = "Weekday vs Weekend: Daily Profiles (Energy Consumption)",
    x = "Hour of Day",
    y = "Energy Consumption"
  ) +
  scale_x_continuous(breaks = seq(0, 23, 2)) +
  theme_minimal() +
  theme(legend.position = "top")

ggplot() +
  geom_line(data = occ_profiles, 
            aes(x = hour, y = library_occupancy, group = date, color = is_weekend), 
            alpha = 0.2) +
  geom_line(data = occ_mean, 
            aes(x = hour, y = library_occupancy, color = is_weekend), 
            linewidth = 1.5) +
  scale_color_manual(
    name = "Kategori Hari", 
    values = c("FALSE" = "blue", "TRUE" = "red"),
    labels = c("Weekday", "Weekend")
  ) +
  labs(
    title = "Weekday vs Weekend: Daily Profiles (Occupancy)",
    x = "Hour of Day",
    y = "Occupancy"
  ) +
  scale_x_continuous(breaks = seq(0, 23, 2)) +
  theme_minimal() +
  theme(legend.position = "top")

Data Mining AdventureR

Kelompok 9: Aditya_075, Prayoga_085, Ananda_090, Thival_195

Data Integration

Extract

Transform

Filtering & Synchronizing The Data

Missing Values & Energy Consumption Aggregation

Duplicate

Outlier

Load

Visualisasi

Analisis

Weekday vs Weekend Comparison