library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tsibble 1.1.5 ✔ fable 0.3.4
## ✔ tsibbledata 0.4.1 ✔ fabletools 0.4.2
## ✔ feasts 0.3.2
## Warning: package 'tsibble' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(readxl)
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.
# Import and inspect data
atm_data <- read_xlsx('ATM624Data.xlsx')
print(head(atm_data))
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
print(tail(atm_data))
## # A tibble: 6 × 3
## DATE ATM Cash
## <dbl> <chr> <dbl>
## 1 40293 ATM4 542.
## 2 40294 ATM4 404.
## 3 40295 ATM4 13.7
## 4 40296 ATM4 348.
## 5 40297 ATM4 44.2
## 6 40298 ATM4 482.
# Convert the Date to a usable format, and convert the tibble to a tsibble
atm_data$DATE <- as.Date(atm_data$DATE, origin = '1899-12-30')
atm_data <- atm_data |>
as_tsibble(key = 'ATM', index = 'DATE') |>
arrange(DATE)
print(head(atm_data))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [4]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-01 ATM3 0
## 4 2009-05-01 ATM4 777.
## 5 2009-05-02 ATM1 82
## 6 2009-05-02 ATM2 89
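As a quick spot-check of the conversion, Excel serial dates (Windows convention) count days from 1899-12-30, so the first serial value in the raw import should map to the first date above:
# Verify the origin: serial 39934 should convert to 2009-05-01
as.Date(39934, origin = '1899-12-30')  # "2009-05-01", matching the tsibble above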
# Filter for ATM 1 and plot data
atm_1_ts <- atm_data |> filter(ATM == 'ATM1')
atm_1_ts |> autoplot(Cash)
This data appears to be seasonal, with a significant drop-off occurring on a regular basis. The ACF plot below confirms this, showing clear peaks in correlation at lags of 7, 14, and 21 days, consistent with weekly seasonality.
atm_1_ts |>
ACF(Cash) |>
autoplot()
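A day-of-week subseries plot (a sketch, not part of the original analysis) makes the weekly pattern explicit by faceting the series by day of the week:
# Sketch: facet daily withdrawals by day of week to show the weekly pattern
atm_1_ts |> gg_subseries(Cash, period = "week")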
The size of the spikes in the data is inconsistent, so a Box-Cox transformation was applied in an attempt to stabilize the variance. However, the transformed data showed less consistent variation, so the transformation was not used.
# Find lambda
lambda_atm1 <- atm_1_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
atm_1_bc <- atm_1_ts |>
mutate(b_c = box_cox(Cash, lambda_atm1)) |>
select(DATE, b_c)
atm_1_bc |> autoplot(b_c)
Finally, missing values were replaced with the median of the data set.
atm_1_ts$Cash[is.na(atm_1_ts$Cash)] <- median(atm_1_ts$Cash, na.rm=TRUE)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. Three models were fit to the training set and used to predict the test set: a seasonal naive model (SNAIVE), an exponential smoothing model (ETS), and an ARIMA model.
The model with the lowest RMSE was the ETS model, so it was used for the final forecast.
# Create Test/Train Split
atm_1_train <- atm_1_ts[1:round(.8*nrow(atm_1_ts)),]
atm_1_test <- anti_join(atm_1_ts, atm_1_train, by = 'DATE')
# Use model to predict the test data
atm_1_model <- atm_1_train |>
model('snaive' = SNAIVE(Cash),
'ets' = ETS(Cash),
'arima' = ARIMA(Cash)) |>
forecast(h = nrow(atm_1_test))
# Evaluate model accuracy
accuracy(atm_1_model, atm_1_ts) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 ets 51.8
## 2 arima 54.2
## 3 snaive 62.5
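For reference, the models are ranked by root mean squared error over the \(h\) test-set observations: \[\text{RMSE} = \sqrt{\frac{1}{h}\sum_{t=1}^{h}\left(y_t - \hat{y}_t\right)^2}\] where \(y_t\) is the observed value and \(\hat{y}_t\) the forecast.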
The selected ETS model produces residuals that look like white noise, as they do not display a pattern or significant outliers.
atm_1_ts |>
model(ETS(Cash)) |>
gg_tsresiduals()
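A Ljung-Box test (a sketch, not included in the original run) can quantify the white-noise claim; a large p-value is consistent with uncorrelated residuals:
# Ljung-Box test on the ETS innovation residuals; lag 14 covers two weekly cycles
atm_1_ts |>
  model(ETS(Cash)) |>
  augment() |>
  features(.innov, ljung_box, lag = 14)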
A 31-day forecast was generated (one full month).
# Generate forecast
atm_1_prediction <- atm_1_ts |>
model(ETS(Cash)) |>
forecast(h = 31)
atm_1_prediction |> autoplot(atm_1_ts, level = NULL)
# Produce final output
atm_1_final <- atm_1_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(head(atm_1_final))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM1 87.1
## 2 2010-05-02 ATM1 101.
## 3 2010-05-03 ATM1 73.1
## 4 2010-05-04 ATM1 5.74
## 5 2010-05-05 ATM1 100.
## 6 2010-05-06 ATM1 79.4
# Filter for ATM 2 and plot data
atm_2_ts <- atm_data |> filter(ATM == 'ATM2')
atm_2_ts |> autoplot(Cash)
As with the previous ATM, this data appears to be seasonal. The ACF plot again confirms weekly seasonality, showing spikes at lags of 7, 14, and 21 days.
atm_2_ts |>
ACF(Cash) |>
autoplot()
Next, missing values were replaced with the median of the data set.
atm_2_ts$Cash[is.na(atm_2_ts$Cash)] <- median(atm_2_ts$Cash, na.rm=TRUE)
As with the previous ATM, the Box-Cox transformation does not appear to have made the variance meaningfully more consistent, so it was not utilized.
# Find lambda
lambda_atm2 <- atm_2_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
atm_2_bc <- atm_2_ts |>
mutate(b_c = box_cox(Cash, lambda_atm2)) |>
select(DATE, b_c)
atm_2_bc |> autoplot(b_c)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. The same three models were fit to the training set and used to predict the test set: SNAIVE, ETS, and ARIMA.
The model with the lowest RMSE was the seasonal naive (SNAIVE) model, so it was used for the final forecast.
# Create Test/Train Split
atm_2_train <- atm_2_ts[1:round(.8*nrow(atm_2_ts)),]
atm_2_test <- anti_join(atm_2_ts, atm_2_train, by = 'DATE')
# Use model to predict the test data
atm_2_model <- atm_2_train |>
model('snaive' = SNAIVE(Cash),
'ets' = ETS(Cash),
'arima' = ARIMA(Cash)) |>
forecast(h = nrow(atm_2_test))
# Evaluate model accuracy
accuracy(atm_2_model, atm_2_ts) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 snaive 59.7
## 2 ets 64.3
## 3 arima 64.9
The selected SNAIVE model produces nearly normal residuals with consistent variance, although the ACF plot suggests that some seasonality may not be fully captured.
atm_2_ts |>
model(SNAIVE(Cash)) |>
gg_tsresiduals()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
A 31-day forecast was generated (one full month).
# Generate forecast
atm_2_prediction <- atm_2_ts |>
model(SNAIVE(Cash)) |>
forecast(h = 31)
atm_2_prediction |> autoplot(atm_2_ts, level = NULL)
# Produce final output
atm_2_final <- atm_2_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(head(atm_2_final))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM2 61
## 2 2010-05-02 ATM2 89
## 3 2010-05-03 ATM2 11
## 4 2010-05-04 ATM2 2
## 5 2010-05-05 ATM2 107
## 6 2010-05-06 ATM2 89
# Filter for ATM 3 and plot data
atm_3_ts <- atm_data |> filter(ATM == 'ATM3')
atm_3_ts |> autoplot(Cash)
Clearly, this ATM was only activated recently (or the provided data is incomplete). Observations with a value of 0 were removed.
atm_3_ts <- atm_3_ts |> filter(Cash != 0)
atm_3_ts |> autoplot(Cash)
Only three data points remain for this ATM. Without additional context, the most defensible forecast is a simple average of the existing observations.
# Generate forecast
atm_3_prediction <- atm_3_ts |>
model(MEAN(Cash)) |>
forecast(h = 31)
atm_3_prediction |> autoplot(atm_3_ts, level = NULL)
# Produce final output
atm_3_final <- atm_3_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(head(atm_3_final))
## # A tsibble: 6 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM3 87.7
## 2 2010-05-02 ATM3 87.7
## 3 2010-05-03 ATM3 87.7
## 4 2010-05-04 ATM3 87.7
## 5 2010-05-05 ATM3 87.7
## 6 2010-05-06 ATM3 87.7
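Since MEAN() simply averages the available observations, the point forecast can be verified directly:
# The MEAN model's point forecast is the arithmetic mean of the three observations
mean(atm_3_ts$Cash)  # approximately 87.7, matching the forecast above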
# Filter for ATM 4 and plot data
atm_4_ts <- atm_data |> filter(ATM == 'ATM4')
atm_4_ts |> autoplot(Cash)
This data set clearly shows a single extreme outlier. This outlier, along with missing values, was replaced with the median of the data set.
atm_4_ts$Cash[atm_4_ts$Cash == max(atm_4_ts$Cash, na.rm = TRUE)] <- median(atm_4_ts$Cash, na.rm = TRUE)
atm_4_ts$Cash[is.na(atm_4_ts$Cash)] <- median(atm_4_ts$Cash, na.rm=TRUE)
atm_4_ts |> autoplot(Cash)
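Replacing the maximum assumes the largest value is the only outlier. A more systematic rule (a sketch, not used in this analysis) flags any value falling outside a multiple of the interquartile range:
# Hypothetical IQR rule: indices of values more than 3 IQRs beyond the quartiles
q <- quantile(atm_4_ts$Cash, c(0.25, 0.75), na.rm = TRUE)
fence <- q + c(-3, 3) * diff(q)
which(atm_4_ts$Cash < fence[1] | atm_4_ts$Cash > fence[2])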
The ACF plot for this data set confirmed the same weekly seasonality shown in ATM1 and ATM2.
atm_4_ts |>
ACF(Cash) |>
autoplot()
As with ATM1 and ATM2, a Box-Cox transformation did not make the size of the variations more consistent, so it was not included.
# Find lambda
lambda_atm4 <- atm_4_ts |>
features(Cash, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
atm_4_bc <- atm_4_ts |>
mutate(b_c = box_cox(Cash, lambda_atm4)) |>
select(DATE, b_c)
atm_4_bc |> autoplot(b_c)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. The same three models were fit to the training set and used to predict the test set: SNAIVE, ETS, and ARIMA.
The model with the lowest RMSE was the ARIMA model, so it was tried first for the final forecast.
# Create Test/Train Split
atm_4_train <- atm_4_ts[1:round(.8*nrow(atm_4_ts)),]
atm_4_test <- anti_join(atm_4_ts, atm_4_train, by = 'DATE')
# Use model to predict the test data
atm_4_model <- atm_4_train |>
model('snaive' = SNAIVE(Cash),
'ets' = ETS(Cash),
'arima' = ARIMA(Cash)) |>
forecast(h = nrow(atm_4_test))
# Evaluate model accuracy
accuracy(atm_4_model, atm_4_ts) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 arima 321.
## 2 ets 395.
## 3 snaive 551.
The selected ARIMA model does not produce normally distributed residuals, indicating that another model might be a better fit.
atm_4_ts |>
model(ARIMA(Cash)) |>
gg_tsresiduals()
# Generate forecast
atm_4_prediction <- atm_4_ts |>
model(ARIMA(Cash)) |>
forecast(h = 31)
atm_4_prediction |> autoplot(atm_4_ts, level = NULL)
It is clear from the plot that this forecast is not appropriate for this data set, so I tested the model with the next-lowest RMSE (ETS). However, this model also produced an odd-looking forecast.
# Generate forecast
atm_4_prediction <- atm_4_ts |>
model(ETS(Cash)) |>
forecast(h = 31)
atm_4_prediction |> autoplot(atm_4_ts, level = NULL)
Finally, I tried the seasonal naive (SNAIVE) model. It produced the most appropriate-looking forecast of the three, so it was used for the final forecast.
# Generate forecast
atm_4_prediction <- atm_4_ts |>
model(SNAIVE(Cash)) |>
forecast(h = 31)
atm_4_prediction |> autoplot(atm_4_ts, level = NULL)
# Produce final output
atm_4_final <- atm_4_prediction[,-c(2,4)] |>
select(2,1,3) |>
rename(Cash = .mean)
print(atm_4_final)
## # A tsibble: 31 x 3 [1D]
## # Key: ATM [1]
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM4 5.45
## 2 2010-05-02 ATM4 542.
## 3 2010-05-03 ATM4 404.
## 4 2010-05-04 ATM4 13.7
## 5 2010-05-05 ATM4 348.
## 6 2010-05-06 ATM4 44.2
## 7 2010-05-07 ATM4 482.
## 8 2010-05-08 ATM4 5.45
## 9 2010-05-09 ATM4 542.
## 10 2010-05-10 ATM4 404.
## # ℹ 21 more rows
Additionally, this model’s residual diagnostics are the closest of the three to white noise.
atm_4_ts |>
model(SNAIVE(Cash)) |>
gg_tsresiduals()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Combine predictions for all four ATMs into a single Data Frame
full_atm_predictions <- rbind(as_tibble(atm_1_final), as_tibble(atm_2_final), as_tibble(atm_3_final), as_tibble(atm_4_final)) |> arrange(DATE)
# Create wide-format predictions
atm_wide <- full_atm_predictions |> pivot_wider(names_from = ATM, values_from = Cash)
# Write predictions to CSV files
write.csv(full_atm_predictions, 'myrianthopoulos_atm_forecast_long.csv')
write.csv(atm_wide, 'myrianthopoulos_atm_forecast_wide.csv')
# Inspect results
print(head(full_atm_predictions))
## # A tibble: 6 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2010-05-01 ATM1 87.1
## 2 2010-05-01 ATM2 61
## 3 2010-05-01 ATM3 87.7
## 4 2010-05-01 ATM4 5.45
## 5 2010-05-02 ATM1 101.
## 6 2010-05-02 ATM2 89
print(head(atm_wide))
## # A tibble: 6 × 5
## DATE ATM1 ATM2 ATM3 ATM4
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2010-05-01 87.1 61 87.7 5.45
## 2 2010-05-02 101. 89 87.7 542.
## 3 2010-05-03 73.1 11 87.7 404.
## 4 2010-05-04 5.74 2 87.7 13.7
## 5 2010-05-05 100. 107 87.7 348.
## 6 2010-05-06 79.4 89 87.7 44.2
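Since the assignment asks for an Excel-readable file and openxlsx is already loaded, the same results could also be written directly to a workbook (a sketch; the file name is illustrative):
# Sketch: write both layouts to a single .xlsx workbook, one sheet per layout
write.xlsx(list(long = full_atm_predictions, wide = atm_wide),
           'myrianthopoulos_atm_forecast.xlsx')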
Part B consists of a simple dataset of residential power usage from January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
power_data <- read_xlsx('ResidentialCustomerForecastLoad-624.xlsx')
print(head(power_data))
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
print(tail(power_data))
## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 919 2013-Jul 8415321
## 2 920 2013-Aug 9080226
## 3 921 2013-Sep 7968220
## 4 922 2013-Oct 5759367
## 5 923 2013-Nov 5769083
## 6 924 2013-Dec 9606304
power_data$`YYYY-MMM` <- yearmonth(power_data$`YYYY-MMM`)
power_data <- power_data |>
rename(Month = `YYYY-MMM`) |>
select(Month, KWH) |>
as_tsibble(index = 'Month')
print(head(power_data))
## # A tsibble: 6 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 1998 Jan 6862583
## 2 1998 Feb 5838198
## 3 1998 Mar 5420658
## 4 1998 Apr 5010364
## 5 1998 May 4665377
## 6 1998 Jun 6467147
power_data |> autoplot(KWH)
This data appears to be seasonal, and the ACF plot was examined to confirm this. It shows clear peaks in correlation at lags of 6, 12, and 18 months and clear troughs at 3, 9, 15, and 21 months, confirming the seasonal nature of the data.
power_data |>
ACF(KWH) |>
autoplot()
In addition to revealing the seasonal nature of the data, the initial plot shows a single clear outlier. This value, along with any missing values, was replaced with the median of the data set.
power_data$KWH[power_data$KWH == min(power_data$KWH, na.rm = TRUE)] <- median(power_data$KWH, na.rm = TRUE)
power_data$KWH[is.na(power_data$KWH)] <- median(power_data$KWH, na.rm=TRUE)
power_data |> autoplot(KWH)
The variance here looks fairly consistent, but a Box-Cox transformation was still checked. The plot of the transformed data did not look meaningfully different from the original, and since unnecessary transformations should be avoided, the Box-Cox transformation was not applied.
# Find lambda
lambda_power <- power_data |>
features(KWH, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
power_data_bc <- power_data |>
mutate(b_c = box_cox(KWH, lambda_power)) |>
select(Month, b_c)
power_data_bc |> autoplot(b_c)
A test/train split was created, with 80% of the data points assigned to the training set and the remaining 20% to the test set. The same three models were fit to the training set and used to predict the test set: SNAIVE, ETS, and ARIMA.
The model with the lowest RMSE was the ARIMA model, so it was used for the final forecast.
# Create Test/Train Split
power_data_train <- power_data[1:round(.8*nrow(power_data)),]
power_data_test <- anti_join(power_data, power_data_train, by = 'Month')
# Use model to predict the test data
power_data_model <- power_data_train |>
model('snaive' = SNAIVE(KWH),
'ets' = ETS(KWH),
'arima' = ARIMA(KWH)) |>
forecast(h = nrow(power_data_test))
# Evaluate model accuracy
accuracy(power_data_model, power_data) |> select(.model, RMSE) |> arrange(RMSE)
## # A tibble: 3 × 2
## .model RMSE
## <chr> <dbl>
## 1 arima 1123229.
## 2 ets 1240425.
## 3 snaive 1265484.
The selected ARIMA model’s residuals appear to be white noise, suggesting that the model is appropriate for the data.
power_data |>
model(ARIMA(KWH)) |>
gg_tsresiduals()
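To see which specification ARIMA() selected automatically, report() can be used (output not shown here):
# Inspect the automatically selected ARIMA specification
power_data |>
  model(ARIMA(KWH)) |>
  report()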
A 12-month forecast was generated (one full year).
# Generate forecast
kwh_prediction <- power_data |>
model(ARIMA(KWH)) |>
forecast(h = 12)
kwh_prediction |> autoplot(power_data, level = NULL)
# Produce final output
kwh_final <- kwh_prediction[,-c(1,3)] |>
rename(KWH = .mean)
print(kwh_final)
## # A tsibble: 12 x 2 [1M]
## Month KWH
## <mth> <dbl>
## 1 2014 Jan 10138894.
## 2 2014 Feb 8757344.
## 3 2014 Mar 6702667.
## 4 2014 Apr 6008940.
## 5 2014 May 5943702.
## 6 2014 Jun 8207930.
## 7 2014 Jul 9520071.
## 8 2014 Aug 10010697.
## 9 2014 Sep 8488807.
## 10 2014 Oct 5867038.
## 11 2014 Nov 6155281.
## 12 2014 Dec 8307433.
# Write forecast to a .csv file
write.csv(kwh_final, 'myrianthopoulos_kwh_forecast.csv')
Part C consists of two data sets. These are simple two-column sets; however, they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (for multiple recordings within an hour, take the mean). Then determine whether the data is stationary and whether it can be forecast. If so, provide a week-forward forecast and present the results via RPubs and the .rmd file, along with the forecast in an Excel-readable file.
# Import and inspect the data
pipe1_data <- read_xlsx('Waterflow_Pipe1.xlsx')
pipe2_data <- read_xlsx('Waterflow_Pipe2.xlsx')
print(head(pipe1_data))
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dbl> <dbl>
## 1 42300. 23.4
## 2 42300. 28.0
## 3 42300. 23.1
## 4 42300. 30.0
## 5 42300. 6.00
## 6 42300. 15.9
print(head(pipe2_data))
## # A tibble: 6 × 2
## `Date Time` WaterFlow
## <dbl> <dbl>
## 1 42300. 18.8
## 2 42300. 43.1
## 3 42300. 38.0
## 4 42300. 36.1
## 5 42300. 31.9
## 6 42300. 28.2
# Average the WaterFlow values within each hour and build an hourly index
pipe_data <- rbind(pipe1_data, pipe2_data) |> rename(date_time = 'Date Time') |> arrange(date_time)
pipe_data$date_time <- convertToDateTime(pipe_data$date_time)
pipe_data$date <- as.Date(pipe_data$date_time)
pipe_data$hour <- format(pipe_data$date_time, format = "%H")
pipe_data <- pipe_data |>
group_by(date, hour) |>
mutate(hour_avg = mean(WaterFlow)) |>
select(date, hour, hour_avg) |>
ungroup() |>
distinct() |>
mutate(date_time = paste0(date, " ", hour, ':00:00'))
pipe_data$date_time <- as.POSIXct(pipe_data$date_time)
pipe_data <- pipe_data |> select(date_time, hour_avg) |> as_tsibble(index = 'date_time')
print(head(pipe_data))
## # A tsibble: 6 x 2 [1h] <?>
## date_time hour_avg
## <dttm> <dbl>
## 1 2015-10-23 00:00:00 26.1
## 2 2015-10-23 01:00:00 18.8
## 3 2015-10-23 02:00:00 24.5
## 4 2015-10-23 03:00:00 25.6
## 5 2015-10-23 04:00:00 22.4
## 6 2015-10-23 05:00:00 23.6
pipe_data |> autoplot(hour_avg)
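The same hourly aggregation can be written more compactly with lubridate::floor_date (a sketch producing an equivalent tsibble):
# Sketch: truncate each timestamp to the hour, then average within each hour
pipe_alt <- rbind(pipe1_data, pipe2_data) |>
  rename(date_time = 'Date Time') |>
  mutate(date_time = floor_date(convertToDateTime(date_time), unit = "hour")) |>
  group_by(date_time) |>
  summarise(hour_avg = mean(WaterFlow)) |>
  as_tsibble(index = date_time)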
The variance is very inconsistent across this data set, so a Box-Cox transformation was applied to make it more uniform.
# Find lambda
lambda_pipe <- pipe_data |>
features(hour_avg, features = guerrero) |>
pull(lambda_guerrero)
# Apply Box-Cox transformation
pipe_data_bc <- pipe_data |>
mutate(b_c = box_cox(hour_avg, lambda_pipe)) |>
select(date_time, b_c)
pipe_data_bc |> autoplot(b_c)
With the data transformed, the unit-root features recommend no seasonal differencing and one order of ordinary differencing to obtain stationary data.
pipe_data_bc |> features(b_c, unitroot_nsdiffs)
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 0
pipe_data_bc |>
features(b_c, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
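The ndiffs feature is based on successive KPSS tests, and the underlying statistic can be inspected directly (under the KPSS null of stationarity, a small p-value is evidence that differencing is needed):
# KPSS unit-root test on the undifferenced transformed series
pipe_data_bc |> features(b_c, unitroot_kpss)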
After one order of differencing, the data exhibits no trend or seasonality and appears stationary (albeit with occasional outliers). It can therefore be forecast with an ARIMA model.
pipe_data_bc |>
mutate(diff_bc = difference(b_c)) |>
autoplot(diff_bc)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Since \(c=0\) and \(d=1\), the long-term forecasts stabilize at a non-zero constant (here, a level near the center of the Box-Cox transformed data), as per Section 9.5 of the textbook.
# Fill gaps in data
pipe_data_bc <- fill_gaps(pipe_data_bc)
# Generate forecast
pipe_prediction <- pipe_data_bc |>
model(ARIMA(b_c)) |>
forecast(h = 168)
pipe_prediction |> autoplot(pipe_data_bc, level = NULL)
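Inspecting the fitted model (a sketch; output not shown in the original) would confirm the selected differencing order and the absence of a constant:
# Check the selected order (d) and constant of the automatically fitted ARIMA
pipe_data_bc |>
  model(ARIMA(b_c)) |>
  report()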
# Produce final output
pipe_final <- pipe_prediction[,-c(1,3)] |>
rename(b_c = .mean) |>
mutate(hour_avg = inv_box_cox(b_c, lambda_pipe)) |>
select(date_time, hour_avg)
print(head(pipe_final))
## # A tsibble: 6 x 2 [1h] <?>
## date_time hour_avg
## <dttm> <dbl>
## 1 2015-12-04 00:00:00 29.3
## 2 2015-12-04 01:00:00 29.6
## 3 2015-12-04 02:00:00 29.6
## 4 2015-12-04 03:00:00 29.6
## 5 2015-12-04 04:00:00 29.6
## 6 2015-12-04 05:00:00 29.6
# Write forecast to a .csv file
write.csv(pipe_final, 'myrianthopoulos_pipe_forecast.csv')