Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour. Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble 1.1.6 ✔ feasts 0.4.2
## ✔ tsibbledata 0.4.1 ✔ fable 0.5.0
## Warning: package 'tsibble' was built under R version 4.5.2
## Warning: package 'tsibbledata' was built under R version 4.5.2
## Warning: package 'feasts' was built under R version 4.5.2
## Warning: package 'fabletools' was built under R version 4.5.2
## Warning: package 'fable' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
pipes_1 <- read.csv('https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/Waterflow_Pipe1.csv', header = TRUE)
glimpse(pipes_1)
## Rows: 1,000
## Columns: 2
## $ Date.Time <chr> "10/23/15 12:24 AM", "10/23/15 12:40 AM", "10/23/15 12:53 AM…
## $ WaterFlow <dbl> 23.369599, 28.002881, 23.065895, 29.972809, 5.997953, 15.935…
pipes_2 <- read.csv('https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/Waterflow_Pipe2.csv', header = TRUE)
glimpse(pipes_2)
## Rows: 1,000
## Columns: 2
## $ Date.Time <chr> "10/23/15 1:00 AM", "10/23/15 2:00 AM", "10/23/15 3:00 AM", …
## $ WaterFlow <dbl> 18.810791, 43.087025, 37.987705, 36.120379, 31.851259, 28.23…
Did I think this imported correctly because the column is called Date.Time? Yes, I did.
pipes_1 <- pipes_1 |>
mutate(Date.Time = mdy_hm(Date.Time))
pipes_2 <- pipes_2 |>
mutate(Date.Time = mdy_hm(Date.Time))
#check for blank values
sum(is.na(pipes_1))
## [1] 0
sum(is.na(pipes_2))
## [1] 0
duplicates(pipes_1)
## Using `Date.Time` as index variable.
## # A tibble: 54 × 2
## Date.Time WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 06:33:00 30.4
## 2 2015-10-23 06:33:00 26.2
## 3 2015-10-23 19:58:00 22.7
## 4 2015-10-23 19:58:00 19.4
## 5 2015-10-23 20:28:00 21.7
## 6 2015-10-23 20:28:00 25.8
## 7 2015-10-24 04:07:00 37.5
## 8 2015-10-24 04:07:00 22.4
## 9 2015-10-24 13:50:00 12.5
## 10 2015-10-24 13:50:00 13.8
## # ℹ 44 more rows
#no duplicates - the doesn't mean there aren't any duplicates between the two data sets
duplicates(pipes_2)
## Using `Date.Time` as index variable.
## # A tibble: 0 × 2
## # ℹ 2 variables: Date.Time <dttm>, WaterFlow <dbl>
#We can calculate the average by grouping
pipes_1_dr <- pipes_1 |>
group_by(Date.Time) |>
summarize(WaterFlow = mean(WaterFlow, na.rm = TRUE)) |>
ungroup()
duplicates(pipes_1_dr)
## Using `Date.Time` as index variable.
## # A tibble: 0 × 2
## # ℹ 2 variables: Date.Time <dttm>, WaterFlow <dbl>
pipes_1_t <- pipes_1_dr |>
as_tsibble(index = Date.Time)
autoplot(pipes_1_t)
## Plot variable not specified, automatically selected `.vars = WaterFlow`
pipes_2_t <- pipes_2 |>
as_tsibble(index = Date.Time)
#merge the tables (union)
all_pipes <- bind_rows(pipes_1_dr, pipes_2)
#average the duplicates (I could have waited and only done this step once)
duplicates(all_pipes)
## Using `Date.Time` as index variable.
## # A tibble: 34 × 2
## Date.Time WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 09:00:00 6.21
## 2 2015-10-24 10:00:00 12.3
## 3 2015-10-24 13:00:00 29.7
## 4 2015-10-25 15:00:00 16.9
## 5 2015-10-26 02:00:00 18.0
## 6 2015-10-26 09:00:00 24.1
## 7 2015-10-27 18:00:00 22.1
## 8 2015-10-28 11:00:00 24.2
## 9 2015-10-29 09:00:00 20.1
## 10 2015-10-29 19:00:00 34.2
## # ℹ 24 more rows
all_pipes_dr <- all_pipes |>
group_by(Date.Time) |>
summarize(WaterFlow = mean(WaterFlow, na.rm = TRUE)) |>
ungroup()
#as a tsibble
all_pipes_t <- all_pipes_dr |>
as_tsibble(index = Date.Time)
#autoplot
autoplot(all_pipes_t)
## Plot variable not specified, automatically selected `.vars = WaterFlow`
There are clearly still multiple values per hour. I need to simplify the date/time stamps so they only include the hour (not minute) and find the mean.
#converts just hours
pipes_formatted <- all_pipes_dr |>
mutate(Date.Time = floor_date(Date.Time, "hour"))
#checking duplicate values
duplicates(pipes_formatted)
## Using `Date.Time` as index variable.
## # A tibble: 1,191 × 2
## Date.Time WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 23.4
## 2 2015-10-23 00:00:00 28.0
## 3 2015-10-23 00:00:00 23.1
## 4 2015-10-23 00:00:00 30.0
## 5 2015-10-23 01:00:00 18.8
## 6 2015-10-23 01:00:00 6.00
## 7 2015-10-23 01:00:00 15.9
## 8 2015-10-23 01:00:00 26.6
## 9 2015-10-23 01:00:00 33.3
## 10 2015-10-23 01:00:00 12.4
## # ℹ 1,181 more rows
#aggregating by hour
pipes_merged <- pipes_formatted |>
group_by(Date.Time) |>
summarize(WaterFlow = mean(WaterFlow, na.rm = TRUE)) |>
ungroup()
#Format as tsibble
pipes_merged_t <- pipes_merged |>
as_tsibble(index = Date.Time)
autoplot(pipes_merged_t)
## Plot variable not specified, automatically selected `.vars = WaterFlow`
#check for gaps in the time series
gaps <- pipes_merged_t |>
count_gaps(.full = TRUE)
print(gaps)
## # A tibble: 0 × 3
## # ℹ 3 variables: .from <dttm>, .to <dttm>, .n <int>
This plot does look strange at first glance, but it makes sense given that pipes 1 has 1000 values between 0 and 40 between October 24 and November 2. Pipes 2 has larger values spread out over the month of November. So, the large number of smaller values in the first data set will disproportionately affect the beginning of the data set.
pipes_merged_t |>
features(WaterFlow, features = guerrero)
## # A tibble: 1 × 1
## lambda_guerrero
## <dbl>
## 1 -0.465
#-.465
#according to nsdiffs, the data does not requre a difference
pipes_merged_t |>
features(WaterFlow, unitroot_nsdiffs)
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 0
#data is not stationary
pipes_merged_t |>
features(WaterFlow, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 4.08 0.01
#adding the box-cox - it's still not stationary
pipes_merged_t |>
features(box_cox(WaterFlow, -0.465), unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 2.13 0.01
#adding a single difference works (it's hard to tell if there's a daily pattern)
pipes_merged_t |>
features(box_cox(WaterFlow, -0.465) |> difference(), unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.00517 0.1
#let's see if there's a discernible pattern
dcmp <- pipes_merged_t |>
model(stl = STL(WaterFlow)) |>
components()
#STL decomposition pulls out a weekly and daily seasonal pattern
autoplot(dcmp)
Given the instructions (stationary data), the assignment focuses on an ARIMA forecast. We can also try a simple seasonal naive forecast (there’s some daily/weekly seasonality). I will try a couple of ETS methods, though ETS may not be a great fit for data with a daily/weekly seasonal pattern (as we see below).
#I'll use 30 days of data for training.
#data set goes from oct 23 2015 to dec 3 -- that's abotu 40 days. 80% = 32 days.
myseries_train <- pipes_merged_t |>
filter(Date.Time < as_datetime("2015-11-23 00:00:00"))
fit_snaive <- myseries_train |>
model(snaive = SNAIVE(WaterFlow))
fc_snaive <- fit_snaive |>
forecast(h = "11 days")
#confidence interval
fc_snaive |>
autoplot(pipes_merged_t)
#no confidence interval
fc_snaive |>
autoplot(pipes_merged_t, level = NULL)
#it's underestimating the new (bigger) spikes for winter
fit_snaive |>
accuracy()
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 snaive Training 0.503 18.5 14.1 -21.8 53.7 1 1 0.0176
The RMSE is 18.5, which seems pretty high. Visually, it doesn’t look like a great mactchup.
#auto-ARIMA - not stepwise (takes a while)
pipes_merged_t |>
model(auto_arima = ARIMA(box_cox(WaterFlow, -0.465), stepwise = FALSE)) |>
report()
## Series: WaterFlow
## Model: ARIMA(0,1,1)
## Transformation: box_cox(WaterFlow, `-0.465`)
##
## Coefficients:
## ma1
## -0.9785
## s.e. 0.0066
##
## sigma^2 estimated as 0.01425: log likelihood=705.61
## AIC=-1407.22 AICc=-1407.21 BIC=-1397.41
#011
#stepwise should give the same answer
pipes_merged_t |>
model(auto_arima = ARIMA(box_cox(WaterFlow, -0.465), stepwise = TRUE)) |>
report()
## Series: WaterFlow
## Model: ARIMA(0,1,1)
## Transformation: box_cox(WaterFlow, `-0.465`)
##
## Coefficients:
## ma1
## -0.9785
## s.e. 0.0066
##
## sigma^2 estimated as 0.01425: log likelihood=705.61
## AIC=-1407.22 AICc=-1407.21 BIC=-1397.41
#it does
#ACF and PACF to check for other options
pipes_merged_t |>
gg_tsdisplay(difference(box_cox(WaterFlow, -0.465)), plot_type = 'partial')
## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
#Slow decay in the PACF - we can also try 012 for the significant spike at lag 2
#and multiple spikes
fit_pipes <- myseries_train |>
model(
auto_arima = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,1)),
arima012 = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,2)),
seasonal_arima = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,1) + PDQ(0,1,1)),
seasonal_arima_2 = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,2) + PDQ(0,1,1))
)
I plotted the first two, which looked very off, then went back and decided to add a seasonal component to capture some level of seasonality.
fc <- fit_pipes |>
forecast(h = "10 days")
fc |> autoplot(pipes_merged_t)
#facet-wrap
fc |> autoplot(pipes_merged_t) +
facet_wrap(~ .model, ncol = 1)
#remove confidence intervals - none of these look like a particularly good fit
fc |> autoplot(pipes_merged_t, level = NULL) +
facet_wrap(~ .model, ncol = 1)
#zoom in on the auto-Arima - which is more or less a straight line
fc |>
filter(.model == "auto_arima") |>
autoplot(pipes_merged_t, level = NULL)
#zoom in on seasonal arima
fc |>
filter(.model == "seasonal_arima") |>
autoplot(pipes_merged_t, level = NULL)
#check the accuracy
fc |>
accuracy(pipes_merged_t)
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima012 Test -6.79 17.6 14.5 -65.3 77.9 1.03 0.951 -0.0174
## 2 auto_arima Test -6.67 17.6 14.5 -64.9 77.6 1.03 0.949 -0.0176
## 3 seasonal_arima Test -13.5 22.2 18.2 -87.1 94.9 1.29 1.20 0.0100
## 4 seasonal_arima_2 Test -13.5 22.2 18.2 -87.3 95.1 1.30 1.20 0.0101
#using a 1 vs 2 q value didn't make much of a difference (none for RMSE)
#the automatic models are technically a better fit, but donn't capture any of the
#fluctuations
#residuals on all four
#distribuitons all look left-skewed
fit_pipes |>
select(auto_arima) |>
gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fit_pipes |>
select(arima012) |>
gg_tsresiduals()
fit_pipes |>
select(seasonal_arima) |>
gg_tsresiduals()
fit_pipes |>
select(seasonal_arima_2) |>
gg_tsresiduals()
Auto-ARIMA and non-seasonal ARIMA are technically a better fit according to residuals, but they capture none of the seasonality.
fit <- pipes_merged_t |>
model(ets = ETS(WaterFlow))
report(fit)
## Series: WaterFlow
## Model: ETS(M,N,N)
## Smoothing parameters:
## alpha = 0.03738732
##
## Initial states:
## l[0]
## 20.60313
##
## sigma^2: 0.1585
##
## AIC AICc BIC
## 12170.65 12170.67 12185.37
#MNN
multiplicative error, no trend, no seasonality
ETS is a less-great model for weekly/daily seasonality, but we could try it with additive seasonality and trend, and see how that goes (MAA or MAdA).
fit_ets <- myseries_train |>
model(
MNN = ETS(WaterFlow ~ error("M") + trend("N") + season("N")),
MAA = ETS(WaterFlow ~ error("M") + trend("A") + season("A")),
MAdA = ETS(WaterFlow ~ error("M") + trend("Ad") + season("A")),
)
fit_ets |>
forecast(h = "11 days") |>
autoplot(pipes_merged_t)
#all plots separately
fit_ets |>
forecast(h = "11 days") |>
autoplot(pipes_merged_t) +
facet_wrap(~ .model, ncol = 1, scales = "free_y")
#without the confidence interval
fit_ets |>
forecast(h = "11 days") |>
autoplot(pipes_merged_t, level = NULL) +
facet_wrap(~ .model, ncol = 1, scales = "free_y")
#check the accurcy
fit_ets |> accuracy()
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MNN Training 0.604 13.8 10.3 -22.2 43.3 0.735 0.746 -0.0363
## 2 MAA Training 0.203 13.6 10.3 -22.7 43.3 0.729 0.737 -0.0173
## 3 MAdA Training 0.302 13.6 10.3 -22.5 43.0 0.730 0.735 -0.0163
ETS has the lowest residuals of any of the models we’ve tried. The best fits are MAdA and MAA. I will try one more thing to make sure the trend is recognized by the model (specifying the period). I will also try a multiplicative seasonality.
fit_ets_2 <- myseries_train |>
model(
MAA = ETS(WaterFlow ~ error("M") + trend("A") + season("M", period = 24)),
MAdA = ETS(WaterFlow ~ error("M") + trend("Ad") + season("M", period = 24)),
)
fit_ets_2 |>
forecast(h = "11 days") |>
autoplot(pipes_merged_t)
#all plots separately
fit_ets_2 |>
forecast(h = "11 days") |>
autoplot(pipes_merged_t) +
facet_wrap(~ .model, ncol = 1, scales = "free_y")
#without the confidence interval
fit_ets_2 |>
forecast(h = "11 days") |>
autoplot(pipes_merged_t, level = NULL) +
facet_wrap(~ .model, ncol = 1, scales = "free_y")
#check the accurcy
fit_ets_2 |> accuracy()
## # A tibble: 2 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MAA Training 0.254 13.7 10.2 -22.8 43.4 0.727 0.740 -0.0192
## 2 MAdA Training 0.349 13.7 10.3 -22.8 43.4 0.731 0.738 -0.0154
This doesn’t change much – the residuals are actually somewhat worse (somewhat better in some places, but not a ton of movement).
Here, I’ll produce one-week forecasts for some of the top models. Seasonal naive was the only model to pick up on the trend, and it’s my pick for producing a forecast even though, unlike all the other forecasts, the confidence interval extends past zero, which is impossible.
fit_all <- pipes_merged_t |>
model(
snaive = SNAIVE(WaterFlow),
MAA = ETS(WaterFlow ~ error("M") + trend("A") + season("A")),
MAdA = ETS(WaterFlow ~ error("M") + trend("Ad") + season("A")),
seasonal_arima = ARIMA(box_cox(WaterFlow, -0.465) ~ pdq(0,1,1) + PDQ(0,1,1))
)
#Forecasts for one week
fc_all <- fit_all |>
forecast(h = "1 week")
#all forecasts together
fc_all |>
autoplot(pipes_merged_t) +
labs(title = "All forecasts, Water Usage")
fc_all |>
autoplot(pipes_merged_t, level = NULL) +
labs(title = "All forecasts, Water Usage (no CI)")
fc_all |>
autoplot(pipes_merged_t) +
facet_wrap(~ .model, ncol = 1)
#breaking these up
fc_all |>
filter(.model %in% c("MAA", "MAdA")) |>
autoplot(pipes_merged_t) +
facet_wrap(~ .model, ncol = 1) +
labs(title = "ETS")
fc_all |>
filter(.model %in% c("MAA", "MAdA")) |>
autoplot(pipes_merged_t, level = NULL) +
facet_wrap(~ .model, ncol = 1) +
labs(title = "ETS")
fc_all |>
filter(.model == "seasonal_arima") |>
autoplot(pipes_merged_t) +
facet_wrap(~ .model, ncol = 1)+
labs(title = "ARIMA")
fc_all |>
filter(.model == "snaive") |>
autoplot(pipes_merged_t) +
labs(title = "SNAIVE")
#create a data frame
water_forecast <- as.data.frame(fc_all)
#create CSV
#write.csv(water_forecast, file="waterforecast.csv", row.names=TRUE)
water_forecast_naive <- water_forecast |>
filter(.model == "snaive")
#write.csv(water_forecast_naive, file="waterforecastnaive.csv", row.names=TRUE)