Load Dataset and Find Summary of NA

# --- Step 1: Loading Libraries and Data Audit ---

library(tidyverse)
library(fpp3)
library(naniar) # Visualizing missingness

# Read the raw residential power data
url1 <- "https://raw.githubusercontent.com/uzmabb182/Data_624_Predictive_Analytics/refs/heads/main/Waterflow_Pipe1.csv"
pipe1_wf <- read_csv(url1)

# View the first few rows
head(pipe1_wf)
## # A tibble: 6 × 2
##   `Date Time`       WaterFlow
##   <chr>                 <dbl>
## 1 10/23/15 12:24 AM     23.4 
## 2 10/23/15 12:40 AM     28.0 
## 3 10/23/15 12:53 AM     23.1 
## 4 10/23/15 12:55 AM     30.0 
## 5 10/23/15 1:19 AM       6.00
## 6 10/23/15 1:23 AM      15.9
# Summary of missing values (NAs)
cat("--- Count of NAs in Waterflow Pipe1 Data ---\n")
## --- Count of NAs in Waterflow Pipe1 Data ---
print(colSums(is.na(pipe1_wf)))
## Date Time WaterFlow 
##         0         0
# --- Step 1: Loading Libraries and Data Audit ---

library(tidyverse)
library(fpp3)
library(naniar) # Visualizing missingness

# Read the raw residential power data
url2 <- "https://raw.githubusercontent.com/uzmabb182/Data_624_Predictive_Analytics/refs/heads/main/Waterflow_Pipe2.csv"
pipe2_wf <- read_csv(url2)

# View the first few rows
head(pipe2_wf)
## # A tibble: 6 × 2
##   `Date Time`      WaterFlow
##   <chr>                <dbl>
## 1 10/23/15 1:00 AM      18.8
## 2 10/23/15 2:00 AM      43.1
## 3 10/23/15 3:00 AM      38.0
## 4 10/23/15 4:00 AM      36.1
## 5 10/23/15 5:00 AM      31.9
## 6 10/23/15 6:00 AM      28.2
# Summary of missing values (NAs)
cat("--- Count of NAs in Waterflow Pipe2 Data ---\n")
## --- Count of NAs in Waterflow Pipe2 Data ---
print(colSums(is.na(pipe2_wf)))
## Date Time WaterFlow 
##         0         0

Convert time and aggregate by hour (Mean)

pipe1_hourly <- read_csv(url1) |>
  mutate(
    # Using parse_date_time to handle multiple date formats automatically
    DateTime = parse_date_time(`Date Time`, orders = c("mdy HMS", "mdy HM", "ymd HMS")),
    Hour = floor_date(DateTime, "hour")
  ) |>
  filter(!is.na(Hour)) |>
  group_by(Hour) |>
  summarise(Pipe1_Flow = mean(WaterFlow, na.rm = TRUE))
## Rows: 1000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date Time
## dbl (1): WaterFlow
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pipe2_hourly <- read_csv(url2) |>
  mutate(
    DateTime = parse_date_time(`Date Time`, orders = c("mdy HMS", "mdy HM", "ymd HMS")),
    Hour = floor_date(DateTime, "hour")
  ) |>
  filter(!is.na(Hour)) |>
  group_by(Hour) |>
  summarise(Pipe2_Flow = mean(WaterFlow, na.rm = TRUE))
## Rows: 1000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date Time
## dbl (1): WaterFlow
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the results of the aggregation
head(pipe1_hourly)
## # A tibble: 6 × 2
##   Hour                Pipe1_Flow
##   <dttm>                   <dbl>
## 1 2015-10-23 01:00:00       17.9
## 2 2015-10-23 02:00:00       22.6
## 3 2015-10-23 03:00:00       22.9
## 4 2015-10-23 04:00:00       24.5
## 5 2015-10-23 05:00:00       20.4
## 6 2015-10-23 06:00:00       20.6
head(pipe2_hourly)
## # A tibble: 6 × 2
##   Hour                Pipe2_Flow
##   <dttm>                   <dbl>
## 1 2015-10-23 01:00:00       37.9
## 2 2015-10-23 02:00:00       30.1
## 3 2015-10-23 03:00:00       36.5
## 4 2015-10-23 04:00:00       40.3
## 5 2015-10-23 05:00:00       44.6
## 6 2015-10-23 06:00:00       32.8

Merge pipes and create a unified tsibble

Using full_join to ensure a complete hourly timeline Aggregating multiple recordings within the same hour into a single mean value.

waterflow_ts <- full_join(pipe1_hourly, pipe2_hourly, by = "Hour") |>
  as_tsibble(index = Hour) |>
  fill_gaps() # This ensures no missing hours in the sequence

# View the final hourly aggregated table
head(waterflow_ts)
## # A tsibble: 6 x 3 [1h] <UTC>
##   Hour                Pipe1_Flow Pipe2_Flow
##   <dttm>                   <dbl>      <dbl>
## 1 2015-10-23 01:00:00       17.9       37.9
## 2 2015-10-23 02:00:00       22.6       30.1
## 3 2015-10-23 03:00:00       22.9       36.5
## 4 2015-10-23 04:00:00       24.5       40.3
## 5 2015-10-23 05:00:00       20.4       44.6
## 6 2015-10-23 06:00:00       20.6       32.8

Visualization and Stationarity Testing

We will plot both pipe flows to look for trends or seasonal patterns and then use the KPSS test to formally check for stationarity. This determines if we can proceed with a forecast or if the data requires transformation (like differencing).

# Visualize both pipes on the same timeline
# This helps identify if the pipes follow similar patterns or have gaps

waterflow_ts |>
  pivot_longer(cols = c(Pipe1_Flow, Pipe2_Flow), names_to = "Pipe", values_to = "Flow") |>
  autoplot(Flow) +
  labs(title = "Hourly Water Flow: Pipe 1 vs Pipe 2",
       subtitle = "Aggregated by Mean per Hour",
       y = "Water Flow Rate",
       x = "Time (Hours)") +
  theme_minimal() +
  facet_wrap(~Pipe, scales = "free_y", ncol = 1)
## Warning: Removed 768 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Perform KPSS Stationarity Test
# Null Hypothesis (H0): The data is stationary.
# If p-value < 0.05, the data is non-stationary and needs differencing.

waterflow_ts |>
  pivot_longer(cols = c(Pipe1_Flow, Pipe2_Flow), names_to = "Pipe", values_to = "Flow") |>
  filter(!is.na(Flow)) |>
  features(Flow, unitroot_kpss) |>
  knitr::kable(caption = "KPSS Stationarity Test Results for Water Flow")
KPSS Stationarity Test Results for Water Flow
Pipe kpss_stat kpss_pvalue
Pipe1_Flow 0.0935193 0.1
Pipe2_Flow 0.1164170 0.1

The KPSS test results p-value = 0.1 are good Since the p-value is greater than 0.05, we fail to reject the null hypothesis, statistically confirming that the water flow in both pipes is stationary.

This means the data is stable and ready for forecasting without needing complex differencing transformations.

Modeling and One-Week Forecast

# The ARIMA function automatically selects the best parameters for stationary data
waterflow_models <- waterflow_ts |>
  pivot_longer(cols = c(Pipe1_Flow, Pipe2_Flow), 
               names_to = "Pipe", 
               values_to = "Flow") |>
  as_tsibble(index = Hour, key = Pipe) |>
  fill_gaps() |>
  model(arima = ARIMA(Flow))

Generate the 1-week (168 hours) forecast

waterflow_forecast <- waterflow_models |>
  forecast(h = "168 hours")

Visualize the Forecast

# This displays the historical data followed by the 7-day projection
waterflow_forecast |>
  autoplot(waterflow_ts |> 
             pivot_longer(cols = c(Pipe1_Flow, Pipe2_Flow), 
                          names_to = "Pipe", values_to = "Flow") |>
             as_tsibble(index = Hour, key = Pipe), 
           level = 80) +
  facet_wrap(~Pipe, scales = "free_y", ncol = 1) +
  labs(title = "One-Week Water Flow Forecast",
       subtitle = "Stationary ARIMA Projections for Pipe 1 and Pipe 2",
       y = "Water Flow Rate",
       x = "Time") +
  theme_minimal()
## Warning: Removed 768 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Save the forecast to an Excel-readable CSV file

final_forecast_output <- waterflow_forecast |>
  as_tibble() |>
  select(Pipe, Hour, .mean) |>
  rename(Forecasted_Flow = .mean)

write_csv(final_forecast_output, "Waterflow_OneWeek_Forecast.csv")

cat("Step 3 Complete. Forecast saved to 'Waterflow_OneWeek_Forecast.csv'.")
## Step 3 Complete. Forecast saved to 'Waterflow_OneWeek_Forecast.csv'.