R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Import Library

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tibble' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'stringr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(hms)

## Warning: package 'hms' was built under R version 4.3.3

## 
## Attaching package: 'hms'
## 
## The following object is masked from 'package:lubridate':
## 
##     hms

Filter & Synchronize the Data

Use only Library building WiFi records

wifi <- read_csv("C:/Users/ZUMROTUS SOLICHA/Documents/MK SEMESTER 5/DATMIN/wifi.csv") %>%
  filter(str_trim(Building) == "Library") %>%
  select(time, `Associated Client Count`)

## Rows: 1883844 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Event Time, Uni, Building, Floor
## dbl  (2): Associated Client Count, Authenticated Client Count
## dttm (1): time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

print("WiFi data (Library only):")

## [1] "WiFi data (Library only):"

print(head(wifi))

## # A tibble: 6 × 2
##   time                `Associated Client Count`
##   <dttm>                                  <dbl>
## 1 2020-02-01 00:02:12                        29
## 2 2020-02-01 00:02:12                         0
## 3 2020-02-01 00:02:12                        21
## 4 2020-02-01 00:02:12                        38
## 5 2020-02-01 00:07:34                         0
## 6 2020-02-01 00:07:34                        33

Ensure all data is standardized at 10-minute intervals

wifi <- wifi %>%
  mutate(time = ymd_hms(time)) %>%
  mutate(time = floor_date(time, "10 minutes"))
print("WiFi data after standardizing to 10-minute intervals:")

## [1] "WiFi data after standardizing to 10-minute intervals:"

print(head(wifi))

## # A tibble: 6 × 2
##   time                `Associated Client Count`
##   <dttm>                                  <dbl>
## 1 2020-02-01 00:00:00                        29
## 2 2020-02-01 00:00:00                         0
## 3 2020-02-01 00:00:00                        21
## 4 2020-02-01 00:00:00                        38
## 5 2020-02-01 00:00:00                         0
## 6 2020-02-01 00:00:00                        33

Resample WiFi data (mean of Associated Client Count)

wifi <- wifi %>%
  group_by(time) %>%
  summarise(mean_client_count = mean(`Associated Client Count`, na.rm = TRUE)) %>%
  ungroup()

print("WiFi resampled (mean client count per 10 minutes):")

## [1] "WiFi resampled (mean client count per 10 minutes):"

print(head(wifi))

## # A tibble: 6 × 2
##   time                mean_client_count
##   <dttm>                          <dbl>
## 1 2020-02-01 00:00:00              22.8
## 2 2020-02-01 00:10:00              20.6
## 3 2020-02-01 00:20:00              17.2
## 4 2020-02-01 00:30:00              15.4
## 5 2020-02-01 00:40:00              12.4
## 6 2020-02-01 00:50:00              10.5

Aggregate library meters into total electricity rate per 10 minutes

library(readr)

lib1 <- read_csv("C:/Users/ZUMROTUS SOLICHA/Documents/MK SEMESTER 5/DATMIN/library1.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib2 <- read_csv("C:/Users/ZUMROTUS SOLICHA/Documents/MK SEMESTER 5/DATMIN/library2.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

lib3 <- read_csv("C:/Users/ZUMROTUS SOLICHA/Documents/MK SEMESTER 5/DATMIN/library3.csv")

## Rows: 18864 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): name, units
## dbl  (3): reading, cumulative, rate
## dttm (1): ts
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

print("Struktur timestamp dari masing-masing listrik:")

## [1] "Struktur timestamp dari masing-masing listrik:"

print(str(lib1$ts))

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

print(str(lib2$ts))

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

print(str(lib3$ts))

##  POSIXct[1:18864], format: "2020-01-01 00:00:00" "2020-01-01 00:10:00" "2020-01-01 00:20:00" ...
## NULL

Data Cleaning

Address missing electricity readings

impute_with_first144 <- function(df, col_name) {
  mean_val <- mean(df[[col_name]][1:144], na.rm = TRUE)
  df[[col_name]][is.na(df[[col_name]])] <- mean_val
  return(df)
}

lib1 <- impute_with_first144(lib1, "rate")
lib2 <- impute_with_first144(lib2, "rate")
lib3 <- impute_with_first144(lib3, "rate")

print("Contoh data listrik setelah imputasi missing value:")

## [1] "Contoh data listrik setelah imputasi missing value:"

print(head(lib1))

## # A tibble: 6 × 6
##   ts                  name              reading units cumulative  rate
##   <dttm>              <chr>               <dbl> <chr>      <dbl> <dbl>
## 1 2020-01-01 00:00:00 MC065-L01/M9R2048 1489442 KWh      1489442  7.38
## 2 2020-01-01 00:10:00 MC065-L01/M9R2048 1489449 KWh      1489449  7   
## 3 2020-01-01 00:20:00 MC065-L01/M9R2048 1489456 KWh      1489456  7   
## 4 2020-01-01 00:30:00 MC065-L01/M9R2048 1489464 KWh      1489464  8   
## 5 2020-01-01 00:40:00 MC065-L01/M9R2048 1489471 KWh      1489471  7   
## 6 2020-01-01 00:50:00 MC065-L01/M9R2048 1489479 KWh      1489479  8

Standardize timestamps and remove duplicates

library(dplyr)

electricity <- full_join(lib1, lib2, by = "ts", suffix = c("_1", "_2")) %>%
  full_join(lib3, by = "ts") %>%
  distinct()

# Hitung total electricity
electricity <- electricity %>%
  mutate(total_electricity = rate_1 + rate_2 + rate)

print("2b. Data listrik gabungan + total:")

## [1] "2b. Data listrik gabungan + total:"

print(head(electricity))

## # A tibble: 6 × 17
##   ts                  name_1        reading_1 units_1 cumulative_1 rate_1 name_2
##   <dttm>              <chr>             <dbl> <chr>          <dbl>  <dbl> <chr> 
## 1 2020-01-01 00:00:00 MC065-L01/M9…   1489442 KWh          1489442   7.38 MC065…
## 2 2020-01-01 00:10:00 MC065-L01/M9…   1489449 KWh          1489449   7    MC065…
## 3 2020-01-01 00:20:00 MC065-L01/M9…   1489456 KWh          1489456   7    MC065…
## 4 2020-01-01 00:30:00 MC065-L01/M9…   1489464 KWh          1489464   8    MC065…
## 5 2020-01-01 00:40:00 MC065-L01/M9…   1489471 KWh          1489471   7    MC065…
## 6 2020-01-01 00:50:00 MC065-L01/M9…   1489479 KWh          1489479   8    MC065…
## # ℹ 10 more variables: reading_2 <dbl>, units_2 <chr>, cumulative_2 <dbl>,
## #   rate_2 <dbl>, name <chr>, reading <dbl>, units <chr>, cumulative <dbl>,
## #   rate <dbl>, total_electricity <dbl>

Visualization of Occupancy–Energy Relationship

Gabungkan wifi & listrik

library(dplyr)

wifi <- wifi %>% rename(ts = time)
df <- inner_join(wifi, electricity, by = "ts")

print("Data gabungan WiFi dan listrik:")

## [1] "Data gabungan WiFi dan listrik:"

print(head(df))

## # A tibble: 6 × 18
##   ts                  mean_client_count name_1    reading_1 units_1 cumulative_1
##   <dttm>                          <dbl> <chr>         <dbl> <chr>          <dbl>
## 1 2020-02-01 00:00:00              22.8 MC065-L0…   1558154 KWh          1558154
## 2 2020-02-01 00:10:00              20.6 MC065-L0…   1558167 KWh          1558167
## 3 2020-02-01 00:20:00              17.2 MC065-L0…   1558180 KWh          1558180
## 4 2020-02-01 00:30:00              15.4 MC065-L0…   1558191 KWh          1558191
## 5 2020-02-01 00:40:00              12.4 MC065-L0…   1558202 KWh          1558202
## 6 2020-02-01 00:50:00              10.5 MC065-L0…   1558213 KWh          1558213
## # ℹ 12 more variables: rate_1 <dbl>, name_2 <chr>, reading_2 <dbl>,
## #   units_2 <chr>, cumulative_2 <dbl>, rate_2 <dbl>, name <chr>, reading <dbl>,
## #   units <chr>, cumulative <dbl>, rate <dbl>, total_electricity <dbl>

Time Series Plots

print("Plot Time Series Occupancy vs Electricity")

## [1] "Plot Time Series Occupancy vs Electricity"

library(ggplot2)

ggplot(df, aes(x = ts)) +
  geom_line(aes(y = mean_client_count, color = "Occupancy")) +
  geom_line(aes(y = total_electricity, color = "Electricity")) +
  labs(title = "Time Series: Occupancy vs Electricity",
       x = "Time", y = "Value", color = "Legend") +
  theme_minimal()

Scatter Plot

print("Scatter Plot Occupancy vs Electricity")

## [1] "Scatter Plot Occupancy vs Electricity"

library(ggplot2)

ggplot(df, aes(x = mean_client_count, y = total_electricity)) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(title = "Scatterplot: Occupancy vs Electricity",
       x = "Mean Client Count", y = "Total Electricity") +
  theme_minimal()

Daily Profiles (24h) + average line

library(dplyr)
library(lubridate)
library(ggplot2)

df <- df %>%
  mutate(date = as_date(ts),
         time_of_day = as_hms(ts))

print("Plot Daily Profile of Occupancy")

## [1] "Plot Daily Profile of Occupancy"

ggplot(df, aes(x = time_of_day, y = mean_client_count, group = date)) +
  geom_line(alpha = 0.2, color = "blue") +
  stat_summary(fun = mean, geom = "line", aes(group = 1),
               color = "red", linewidth = 1.2) +
  labs(title = "Daily Profile of Occupancy",
       x = "Time of Day", y = "Mean Client Count") +
  theme_minimal()

Analysis

Identify peak hours of occupancy

peak_hours <- df %>%
  mutate(hour = hour(ts)) %>%
  group_by(hour) %>%
  summarise(avg_occupancy = mean(mean_client_count, na.rm = TRUE)) %>%
  arrange(desc(avg_occupancy))

print("Peak hours of occupancy:")

## [1] "Peak hours of occupancy:"

print(peak_hours)

## # A tibble: 24 × 2
##     hour avg_occupancy
##    <int>         <dbl>
##  1    15          370.
##  2    14          357.
##  3    16          346.
##  4    13          331.
##  5    12          300.
##  6    17          292.
##  7    11          260.
##  8    18          233.
##  9    19          193.
## 10    10          183.
## # ℹ 14 more rows

Correlation

correlation <- cor(df$mean_client_count, df$total_electricity, use = "complete.obs")

print("Korelasi Occupancy vs Electricity:")

## [1] "Korelasi Occupancy vs Electricity:"

print(correlation)

## [1] 0.8782117

Highlight anomaly cases

print("Kasus di mana listrik tinggi tapi occupancy rendah:")

## [1] "Kasus di mana listrik tinggi tapi occupancy rendah:"

df %>%
  filter(
    mean_client_count < quantile(mean_client_count, 0.25, na.rm = TRUE) &
      total_electricity > quantile(total_electricity, 0.75, na.rm = TRUE)
  ) %>%
  head()

## # A tibble: 0 × 20
## # ℹ 20 variables: ts <dttm>, mean_client_count <dbl>, name_1 <chr>,
## #   reading_1 <dbl>, units_1 <chr>, cumulative_1 <dbl>, rate_1 <dbl>,
## #   name_2 <chr>, reading_2 <dbl>, units_2 <chr>, cumulative_2 <dbl>,
## #   rate_2 <dbl>, name <chr>, reading <dbl>, units <chr>, cumulative <dbl>,
## #   rate <dbl>, total_electricity <dbl>, date <date>, time_of_day <time>

Weekend vs Weekday Comparison

library(lubridate)
library(dplyr)

df <- df %>%
  mutate(
    weekday = wday(ts, label = TRUE), 
    is_weekend = ifelse(weekday %in% c("Sat", "Sun"), "Weekend", "Weekday")
  )

head(df)

## # A tibble: 6 × 22
##   ts                  mean_client_count name_1    reading_1 units_1 cumulative_1
##   <dttm>                          <dbl> <chr>         <dbl> <chr>          <dbl>
## 1 2020-02-01 00:00:00              22.8 MC065-L0…   1558154 KWh          1558154
## 2 2020-02-01 00:10:00              20.6 MC065-L0…   1558167 KWh          1558167
## 3 2020-02-01 00:20:00              17.2 MC065-L0…   1558180 KWh          1558180
## 4 2020-02-01 00:30:00              15.4 MC065-L0…   1558191 KWh          1558191
## 5 2020-02-01 00:40:00              12.4 MC065-L0…   1558202 KWh          1558202
## 6 2020-02-01 00:50:00              10.5 MC065-L0…   1558213 KWh          1558213
## # ℹ 16 more variables: rate_1 <dbl>, name_2 <chr>, reading_2 <dbl>,
## #   units_2 <chr>, cumulative_2 <dbl>, rate_2 <dbl>, name <chr>, reading <dbl>,
## #   units <chr>, cumulative <dbl>, rate <dbl>, total_electricity <dbl>,
## #   date <date>, time_of_day <time>, weekday <ord>, is_weekend <chr>

Occupancy Weekday vs Weekend

library(ggplot2)

ggplot(df, aes(x = ts, y = mean_client_count)) +
  geom_line(color = "blue", alpha = 0.6) +
  facet_wrap(~is_weekend, ncol = 1, scales = "free_x") +
  labs(
    title = "Occupancy: Weekday vs Weekend (Split)",
    x = "Time", 
    y = "Mean Client Count"
  ) +
  theme_minimal()

Electricity Weekday vs Weekend

library(ggplot2)

ggplot(df, aes(x = ts, y = total_electricity)) +
  geom_line(color = "red", alpha = 0.6) +
  facet_wrap(~is_weekend, ncol = 1, scales = "free_x") +
  labs(title = "Electricity: Weekday vs Weekend (Split)",
       x = "Time", y = "Total Electricity") +
  theme_minimal()

Scatter Weekday vs Weekend

library(ggplot2)

ggplot(df, aes(x = mean_client_count, y = total_electricity)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  facet_wrap(~is_weekend) +
  labs(title = "Scatter: Occupancy vs Electricity (Split by Weekday/Weekend)",
       x = "Mean Client Count", y = "Total Electricity") +
  theme_minimal()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

DATA MINING MADNESS_KELOMPOK 6_Azzahra(196)_Zumrotus(191)_Dhafa(192)_Nayara(198)

Zumrotus Solicha

2025-09-13