Project 1 - Part C

Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour. Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.

Import the data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble     1.1.6     ✔ feasts      0.4.2
## ✔ tsibbledata 0.4.1     ✔ fable       0.5.0
## Warning: package 'tsibble' was built under R version 4.5.2
## Warning: package 'tsibbledata' was built under R version 4.5.2
## Warning: package 'feasts' was built under R version 4.5.2
## Warning: package 'fabletools' was built under R version 4.5.2
## Warning: package 'fable' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
pipes_1 <- read.csv('https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/Waterflow_Pipe1.csv', header = TRUE)

glimpse(pipes_1)
## Rows: 1,000
## Columns: 2
## $ Date.Time <chr> "10/23/15 12:24 AM", "10/23/15 12:40 AM", "10/23/15 12:53 AM…
## $ WaterFlow <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.935…
pipes_2 <- read.csv('https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/Waterflow_Pipe2.csv', header = TRUE)

glimpse(pipes_2)
## Rows: 1,000
## Columns: 2
## $ Date.Time <chr> "10/23/15 1:00 AM", "10/23/15 2:00 AM", "10/23/15 3:00 AM", …
## $ WaterFlow <dbl> 18.810791, 43.087025, 37.987705, 36.120379, 31.851259, 28.23…

Converting characters to dates

Did I think this imported correctly because the column is called Date.Time? Yes, I did.

pipes_1 <- pipes_1 |>
    mutate(Date.Time = mdy_hm(Date.Time))

pipes_2 <- pipes_2 |>
    mutate(Date.Time = mdy_hm(Date.Time))

#check for blank values
sum(is.na(pipes_1))
## [1] 0
sum(is.na(pipes_2))
## [1] 0

Dealing with duplicates

duplicates(pipes_1)
## Using `Date.Time` as index variable.
## # A tibble: 54 × 2
##    Date.Time           WaterFlow
##    <dttm>                  <dbl>
##  1 2015-10-23 06:33:00      30.4
##  2 2015-10-23 06:33:00      26.2
##  3 2015-10-23 19:58:00      22.7
##  4 2015-10-23 19:58:00      19.4
##  5 2015-10-23 20:28:00      21.7
##  6 2015-10-23 20:28:00      25.8
##  7 2015-10-24 04:07:00      37.5
##  8 2015-10-24 04:07:00      22.4
##  9 2015-10-24 13:50:00      12.5
## 10 2015-10-24 13:50:00      13.8
## # ℹ 44 more rows
#no duplicates - the doesn't mean there aren't any duplicates between the two data sets
duplicates(pipes_2)
## Using `Date.Time` as index variable.
## # A tibble: 0 × 2
## # ℹ 2 variables: Date.Time <dttm>, WaterFlow <dbl>
#We can calculate the average by grouping

pipes_1_dr <- pipes_1 |>
  group_by(Date.Time) |>
  summarize(WaterFlow = mean(WaterFlow, na.rm = TRUE)) |>
  ungroup()

duplicates(pipes_1_dr)
## Using `Date.Time` as index variable.
## # A tibble: 0 × 2
## # ℹ 2 variables: Date.Time <dttm>, WaterFlow <dbl>
pipes_1_t <- pipes_1_dr |> 
  as_tsibble(index = Date.Time)

autoplot(pipes_1_t)
## Plot variable not specified, automatically selected `.vars = WaterFlow`

pipes_2_t <- pipes_2 |> 
  as_tsibble(index = Date.Time)

#merge the tables (union)
all_pipes <- bind_rows(pipes_1_dr, pipes_2)

#average the duplicates (I could have waited and only done this step once)
duplicates(all_pipes)
## Using `Date.Time` as index variable.
## # A tibble: 34 × 2
##    Date.Time           WaterFlow
##    <dttm>                  <dbl>
##  1 2015-10-23 09:00:00      6.21
##  2 2015-10-24 10:00:00     12.3 
##  3 2015-10-24 13:00:00     29.7 
##  4 2015-10-25 15:00:00     16.9 
##  5 2015-10-26 02:00:00     18.0 
##  6 2015-10-26 09:00:00     24.1 
##  7 2015-10-27 18:00:00     22.1 
##  8 2015-10-28 11:00:00     24.2 
##  9 2015-10-29 09:00:00     20.1 
## 10 2015-10-29 19:00:00     34.2 
## # ℹ 24 more rows
all_pipes_dr <- all_pipes |>
  group_by(Date.Time) |>
  summarize(WaterFlow = mean(WaterFlow, na.rm = TRUE)) |>
  ungroup()

#as a tsibble
all_pipes_t <- all_pipes_dr |> 
  as_tsibble(index = Date.Time)


#autoplot
autoplot(all_pipes_t)
## Plot variable not specified, automatically selected `.vars = WaterFlow`

Cleaning hourly timestamps

There are clearly still multiple values per hour. I need to simplify the date/time stamps so they only include the hour (not minute) and find the mean.

#converts just hours
pipes_formatted <- all_pipes_dr |>
  mutate(Date.Time = floor_date(Date.Time, "hour"))

#checking duplicate values
duplicates(pipes_formatted)
## Using `Date.Time` as index variable.
## # A tibble: 1,191 × 2
##    Date.Time           WaterFlow
##    <dttm>                  <dbl>
##  1 2015-10-23 00:00:00     23.4 
##  2 2015-10-23 00:00:00     28.0 
##  3 2015-10-23 00:00:00     23.1 
##  4 2015-10-23 00:00:00     30.0 
##  5 2015-10-23 01:00:00     18.8 
##  6 2015-10-23 01:00:00      6.00
##  7 2015-10-23 01:00:00     15.9 
##  8 2015-10-23 01:00:00     26.6 
##  9 2015-10-23 01:00:00     33.3 
## 10 2015-10-23 01:00:00     12.4 
## # ℹ 1,181 more rows
#aggregating by hour
pipes_merged <- pipes_formatted |>
  group_by(Date.Time) |>
  summarize(WaterFlow = mean(WaterFlow, na.rm = TRUE)) |>
  ungroup()


#Format as tsibble
pipes_merged_t <- pipes_merged |> 
  as_tsibble(index = Date.Time)

autoplot(pipes_merged_t)
## Plot variable not specified, automatically selected `.vars = WaterFlow`

#check for gaps in the time series
gaps <- pipes_merged_t |>
  count_gaps(.full = TRUE)


print(gaps)
## # A tibble: 0 × 3
## # ℹ 3 variables: .from <dttm>, .to <dttm>, .n <int>

Why does it look like that?

This plot does look strange at first glance, but it makes sense given that pipes 1 has 1000 values between 0 and 40 between October 24 and November 2. Pipes 2 has larger values spread out over the month of November. So, the large number of smaller values in the first data set will disproportionately affect the beginning of the data set.

Are the data stationary?

pipes_merged_t |> 
  features(WaterFlow, features = guerrero)
## # A tibble: 1 × 1
##   lambda_guerrero
##             <dbl>
## 1          -0.465
#-.465

#according to nsdiffs, the data does not requre a difference
pipes_merged_t |> 
  features(WaterFlow, unitroot_nsdiffs)
## # A tibble: 1 × 1
##   nsdiffs
##     <int>
## 1       0
#data is not stationary
pipes_merged_t |> 
  features(WaterFlow, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      4.08        0.01
#adding the box-cox - it's still not stationary
  
pipes_merged_t |> 
  features(box_cox(WaterFlow, -0.465), unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      2.13        0.01
#adding a single difference works (it's hard to tell if there's a daily pattern)
pipes_merged_t |> 
  features(box_cox(WaterFlow, -0.465) |> difference(), unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1   0.00517         0.1
#let's see if there's a discernible pattern 
dcmp <- pipes_merged_t |>
  model(stl = STL(WaterFlow)) |>
  components()

#STL decomposition pulls out a weekly and daily seasonal pattern
autoplot(dcmp)

Choosing forecasting methods

Given the instructions (stationary data), the assignment focuses on an ARIMA forecast. We can also try a simple seasonal naive forecast (there’s some daily/weekly seasonality). I will try a couple of ETS methods, though ETS may not be a great fit for data with a daily/weekly seasonal pattern (as we see below).

Seasonal Naive

#I'll use 30 days of data for training.
#data set goes from oct 23 2015 to dec 3 -- that's abotu 40 days. 80% = 32 days.
myseries_train <- pipes_merged_t |>
  filter(Date.Time < as_datetime("2015-11-23 00:00:00"))

fit_snaive <- myseries_train |>
  model(snaive = SNAIVE(WaterFlow))

fc_snaive <- fit_snaive |>
  forecast(h = "11 days")

#confidence interval
fc_snaive |>
  autoplot(pipes_merged_t)

#no confidence interval
fc_snaive |>
  autoplot(pipes_merged_t, level = NULL)

#it's underestimating the new (bigger) spikes for winter 

fit_snaive |>
  accuracy()
## # A tibble: 1 × 10
##   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 snaive Training 0.503  18.5  14.1 -21.8  53.7     1     1 0.0176

The RMSE is 18.5, which seems pretty high. Visually, it doesn’t look like a great mactchup.

ARIMA

#auto-ARIMA - not stepwise (takes a while)
pipes_merged_t |>
  model(auto_arima = ARIMA(box_cox(WaterFlow, -0.465), stepwise = FALSE)) |>
  report()
## Series: WaterFlow 
## Model: ARIMA(0,1,1) 
## Transformation: box_cox(WaterFlow, `-0.465`) 
## 
## Coefficients:
##           ma1
##       -0.9785
## s.e.   0.0066
## 
## sigma^2 estimated as 0.01425:  log likelihood=705.61
## AIC=-1407.22   AICc=-1407.21   BIC=-1397.41
#011

#stepwise should give the same answer
pipes_merged_t |>
  model(auto_arima = ARIMA(box_cox(WaterFlow, -0.465), stepwise = TRUE)) |>
  report()
## Series: WaterFlow 
## Model: ARIMA(0,1,1) 
## Transformation: box_cox(WaterFlow, `-0.465`) 
## 
## Coefficients:
##           ma1
##       -0.9785
## s.e.   0.0066
## 
## sigma^2 estimated as 0.01425:  log likelihood=705.61
## AIC=-1407.22   AICc=-1407.21   BIC=-1397.41
#it does

#ACF and PACF to check for other options
pipes_merged_t |>
  gg_tsdisplay(difference(box_cox(WaterFlow, -0.465)), plot_type = 'partial')
## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#Slow decay in the PACF - we can also try 012 for the significant spike at lag 2
#and multiple spikes


fit_pipes <-  myseries_train |>
  model(
    auto_arima = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,1)),
    arima012 = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,2)),
    seasonal_arima = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,1) + PDQ(0,1,1)),
    seasonal_arima_2 = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,2) + PDQ(0,1,1))
  )

I plotted the first two, which looked very off, then went back and decided to add a seasonal component to capture some level of seasonality.

fc <- fit_pipes |> 
  forecast(h = "10 days")


fc |> autoplot(pipes_merged_t)

#facet-wrap
fc |> autoplot(pipes_merged_t) +
     facet_wrap(~ .model, ncol = 1) 

#remove confidence intervals - none of these look like a particularly good fit
fc |> autoplot(pipes_merged_t, level = NULL) +
     facet_wrap(~ .model, ncol = 1) 

#zoom in on the auto-Arima - which is more or less a straight line
fc |> 
    filter(.model == "auto_arima") |> 
    autoplot(pipes_merged_t, level = NULL)

#zoom in on seasonal arima
fc |> 
    filter(.model == "seasonal_arima") |> 
    autoplot(pipes_merged_t, level = NULL)

#check the accuracy
fc |> 
  accuracy(pipes_merged_t)
## # A tibble: 4 × 10
##   .model           .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>            <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 arima012         Test   -6.79  17.6  14.5 -65.3  77.9  1.03 0.951 -0.0174
## 2 auto_arima       Test   -6.67  17.6  14.5 -64.9  77.6  1.03 0.949 -0.0176
## 3 seasonal_arima   Test  -13.5   22.2  18.2 -87.1  94.9  1.29 1.20   0.0100
## 4 seasonal_arima_2 Test  -13.5   22.2  18.2 -87.3  95.1  1.30 1.20   0.0101
#using a 1 vs 2 q value didn't make much of a difference (none for RMSE)
#the automatic models are technically a better fit, but donn't capture any of the 
#fluctuations

#residuals on all four

#distribuitons all look left-skewed
fit_pipes |> 
  select(auto_arima) |> 
  gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fit_pipes |> 
  select(arima012) |> 
  gg_tsresiduals()

fit_pipes |> 
  select(seasonal_arima) |> 
  gg_tsresiduals()

fit_pipes |> 
  select(seasonal_arima_2) |> 
  gg_tsresiduals()

Auto-ARIMA and non-seasonal ARIMA are technically a better fit according to residuals, but they capture none of the seasonality.

ETS

fit <- pipes_merged_t |>
    model(ets = ETS(WaterFlow))

report(fit)
## Series: WaterFlow 
## Model: ETS(M,N,N) 
##   Smoothing parameters:
##     alpha = 0.03738732 
## 
##   Initial states:
##      l[0]
##  20.60313
## 
##   sigma^2:  0.1585
## 
##      AIC     AICc      BIC 
## 12170.65 12170.67 12185.37
#MNN

multiplicative error, no trend, no seasonality

ETS is a less-great model for weekly/daily seasonality, but we could try it with additive seasonality and trend, and see how that goes (MAA or MAdA).

fit_ets <- myseries_train |>
  model( 
        MNN = ETS(WaterFlow ~ error("M") + trend("N") + season("N")),
        MAA = ETS(WaterFlow ~ error("M") + trend("A") + season("A")),
        MAdA = ETS(WaterFlow ~ error("M") + trend("Ad") + season("A")),
        
        )

fit_ets |> 
  forecast(h = "11 days") |> 
  autoplot(pipes_merged_t)

#all plots separately 
fit_ets |> 
    forecast(h = "11 days") |> 
    autoplot(pipes_merged_t) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y") 

#without the confidence interval
fit_ets |> 
    forecast(h = "11 days") |> 
    autoplot(pipes_merged_t, level = NULL) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y") 

#check the accurcy
fit_ets |> accuracy()
## # A tibble: 3 × 10
##   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 MNN    Training 0.604  13.8  10.3 -22.2  43.3 0.735 0.746 -0.0363
## 2 MAA    Training 0.203  13.6  10.3 -22.7  43.3 0.729 0.737 -0.0173
## 3 MAdA   Training 0.302  13.6  10.3 -22.5  43.0 0.730 0.735 -0.0163

ETS has the lowest residuals of any of the models we’ve tried. The best fits are MAdA and MAA. I will try one more thing to make sure the trend is recognized by the model (specifying the period). I will also try a multiplicative seasonality.

fit_ets_2 <- myseries_train |>
  model( 
        MAA = ETS(WaterFlow ~ error("M") + trend("A") + season("M", period = 24)),
        MAdA = ETS(WaterFlow ~ error("M") + trend("Ad") + season("M", period = 24)),
        
        )

fit_ets_2 |> 
  forecast(h = "11 days") |> 
  autoplot(pipes_merged_t)

#all plots separately 
fit_ets_2 |> 
    forecast(h = "11 days") |> 
    autoplot(pipes_merged_t) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y") 

#without the confidence interval
fit_ets_2 |> 
    forecast(h = "11 days") |> 
    autoplot(pipes_merged_t, level = NULL) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y") 

#check the accurcy
fit_ets_2 |> accuracy()
## # A tibble: 2 × 10
##   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 MAA    Training 0.254  13.7  10.2 -22.8  43.4 0.727 0.740 -0.0192
## 2 MAdA   Training 0.349  13.7  10.3 -22.8  43.4 0.731 0.738 -0.0154

This doesn’t change much – the residuals are actually somewhat worse (somewhat better in some places, but not a ton of movement).

Forecasting

Here, I’ll produce one-week forecasts for some of the top models. Seasonal naive was the only model to pick up on the trend, and it’s my pick for producing a forecast even though, unlike all the other forecasts, the confidence interval extends past zero, which is impossible.

fit_all <- pipes_merged_t |>
  model(
    snaive = SNAIVE(WaterFlow),
    MAA = ETS(WaterFlow ~ error("M") + trend("A") + season("A")),
    MAdA = ETS(WaterFlow ~ error("M") + trend("Ad") + season("A")),
    seasonal_arima = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,1) + PDQ(0,1,1))
  )

#Forecasts for one week
fc_all <- fit_all |> 
  forecast(h = "1 week")

#all forecasts together 
fc_all |> 
  autoplot(pipes_merged_t) + 
  labs(title = "All forecasts, Water Usage")

fc_all |> 
  autoplot(pipes_merged_t, level = NULL) + 
  labs(title = "All forecasts, Water Usage (no CI)")

fc_all |> 
  autoplot(pipes_merged_t) +
  facet_wrap(~ .model, ncol = 1)

#breaking these up 

fc_all |> 
  filter(.model %in% c("MAA", "MAdA")) |> 
  autoplot(pipes_merged_t) +
  facet_wrap(~ .model, ncol = 1) + 
  labs(title = "ETS")

fc_all |> 
  filter(.model %in% c("MAA", "MAdA")) |> 
  autoplot(pipes_merged_t, level = NULL) +
  facet_wrap(~ .model, ncol = 1) + 
  labs(title = "ETS")

fc_all |> 
  filter(.model == "seasonal_arima") |> 
  autoplot(pipes_merged_t) +
  facet_wrap(~ .model, ncol = 1)+ 
  labs(title = "ARIMA")

fc_all |> 
  filter(.model == "snaive") |> 
  autoplot(pipes_merged_t) + 
  labs(title = "SNAIVE")

#create a data frame
water_forecast <- as.data.frame(fc_all)

#create CSV
#write.csv(water_forecast, file="waterforecast.csv", row.names=TRUE)
water_forecast_naive <- water_forecast |>
  filter(.model == "snaive")

#write.csv(water_forecast_naive, file="waterforecastnaive.csv", row.names=TRUE)