Load packages and data

library(dplyr)
library(ggplot2)
library(tidyr)
library(tibble)
library(tsibble)
library(ggfortify)
library(tidyverse)
library(fpp3)
library(moments)
library(zoo)
library(fable)
library(brew)
library(sos)
library(slider)
library(seasonal)
library(x13binary)
library(USgas)

Questions

Question 1

gas_data <- readxl::read_excel("//Users//colinadams//Documents//GCSU//Fall 2022//Forecasting//Forecasts//Forecast 3//gas_prices.xlsx")

gas_data <- gas_data %>%
  filter(year(Week) > 2009) %>% 
  mutate(Week = yearweek(Week)) %>%
  as_tsibble(index = Week)

gas_data_train <- gas_data %>%
  filter(year(Week) < 2020)

gas_data_test <- gas_data %>%
  filter(year(Week) > 2019)

autoplot(gas_data) +
  labs(title = "US Gas Price since 2010", y = "Gas Price in Dollars", x = "Week")
## Plot variable not specified, automatically selected `.vars = Price`

Question 2

gas_data_log <- gas_data %>%
  mutate(Price = log(Price))
autoplot(gas_data_log)
## Plot variable not specified, automatically selected `.vars = Price`

gas_data %>%
  features(Price, features = guerrero)
## # A tibble: 1 × 1
##   lambda_guerrero
##             <dbl>
## 1          -0.342
gas_data %>%
  autoplot(box_cox(Price, -0.342)) +
  labs(title = "Box-Cox Transformation of US Gas Prices", y = "Gas Price in Dollars", x = "Week")

gas_dataadd <- gas_data %>%
  model(classical_decomposition(Price, type = "additive")) %>%
  components() %>%
  autoplot() +
  labs(title = "Classical Additive Decomposition of Gas Prices")
gas_dataadd
## Warning: Removed 26 row(s) containing missing values (geom_path).

gas_datamulti <- gas_data %>%
  model(classical_decomposition(Price, type = "multiplicative")) %>%
  components() %>%
  autoplot() +
  labs(title = "Classical Multiplicative Decomposition of Gas Prices")
gas_datamulti
## Warning: Removed 26 row(s) containing missing values (geom_path).

gas_datastlperiod <- gas_data %>%
  model(STL(Price ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>% 
  autoplot() +
  labs(title = "Periodic STL Decomposition of Gas Prices")
gas_datastlperiod

gas_datadcmp <- gas_data %>%
  model(stl = STL(Price))

gas_data %>%
  autoplot(Price, color = "gray") +
  autolayer(components(gas_datadcmp), trend, color = "red") +
  labs(y = "Gas Price in Dollars", title = "Trend of Gas Prices Since 2010")

None of the transformations I attempted reduced the variance or changed the data in any significant way. Neither the log transformation nor the Box-Cox helped in understanding the gas data.

Using three different compositions I could see the breakdown of the trend, seasonal, and random components in the data. The trend looks similar in all decomposition. The price increases, plateaus, then decreases. This pattern repeats twice in the data. There is a clear one year season. In the data we can see the effects of the 2008 recession where gas prices increased for years after it. You can see a similar effect during COVID where gas prices increase. Between these recessions gas prices steadied around $2.50/mile

Question 3

gas_data_zoom <- gas_data %>%
  filter(year(Week) > 2021)

gas_fc_plot <- gas_data_train %>%
  model(
    Mean = MEAN(Price),
    Naive = NAIVE(Price),
    Seasonal_naive = SNAIVE(Price),
    Drift = RW(Price ~ drift())) 

gas_fc_plot %>% 
  forecast(h = 143) %>% 
  autoplot(gas_data, level = NULL) +
  labs(title = "Forecasts of US Gas Price", y = "Gas Price in Dollars", x = "Week") +
  guides(color = guide_legend(title = "Forecast"))

gas_data_naive_train <- gas_data_train %>%
  model(NAIVE(Price)) 
gas_data_naive_test <- gas_data_test %>%
  model(NAIVE(Price)) 

gas_data_naive_test %>%
  forecast(h = "2 weeks") %>%
  autoplot(gas_data_zoom) +
  labs(title = "Naive Forecast of US Gas Price", y = "Gas Price in Dollars", x = "Week")

gas_data_mean_train <- gas_data_train %>%
  model(MEAN(Price)) 
gas_data_mean_test <- gas_data_test %>%
  model(MEAN(Price)) 

gas_data_mean_test %>%
  forecast(h = "2 weeks") %>%
  autoplot(gas_data_zoom) +
  labs(title = "Mean Forecast of US Gas Price", y = "Gas Price in Dollars", x = "Week")

gas_data_drift_train <- gas_data_train %>%
  model(RW(Price ~ drift())) 
gas_data_drift_test <- gas_data_test %>%
  model(RW(Price ~ drift())) 

gas_data_drift_test %>%
  forecast(h = "2 weeks") %>%
  autoplot(gas_data_zoom)+
  labs(title = "Drift Forecast of US Gas Price", y = "Gas Price in Dollars", x = "Week")

I used the naive, mean, and drift methods for forecasting. I chose these because they provide good short-run forecasts in this situation. Gas price change in a short window of two weeks are almost random without any seasonality. There is a trend that will affect my two week forecast. These reasons are why I believe the drift forecast to be the best. The drift takes into account the randomness of gas prices in the short-run while still following the trend. The mean forecast is not good in this case because gas prices from years ago should not have a bearing on my forecast for two weeks.

Question 4

accuracy(gas_fc_plot)
## # A tibble: 4 × 10
##   .model         .type          ME   RMSE    MAE     MPE  MAPE  MASE RMSSE  ACF1
##   <chr>          <chr>       <dbl>  <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mean           Traini…  8.37e-17 0.539  0.470  -3.35   16.2  1.28  1.13  0.995
## 2 Naive          Traini… -1.87e- 4 0.0489 0.0372 -0.0212  1.27 0.102 0.102 0.559
## 3 Seasonal_naive Traini… -1.66e- 2 0.479  0.367  -2.00   12.9  1     1     0.991
## 4 Drift          Traini… -1.90e-16 0.0489 0.0372 -0.0148  1.27 0.102 0.102 0.559
gg_tsresiduals(gas_data_naive_train) +
  labs(title = "Naive Forecast Residuals Test")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).

augment(gas_data_naive_train) %>%
  features(.resid, ljung_box, lag = 24, dof = 0)
## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 NAIVE(Price)    314.         0
gg_tsresiduals(gas_data_mean_train) +
  labs(title = "Mean Forecast Residuals Test")

augment(gas_data_mean_train) %>%
  features(.resid, ljung_box, lag = 24, dof = 0)
## # A tibble: 1 × 3
##   .model      lb_stat lb_pvalue
##   <chr>         <dbl>     <dbl>
## 1 MEAN(Price)   9138.         0
gg_tsresiduals(gas_data_drift_train) +
  labs(title = "Drift Forecast Residuals Test")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).

augment(gas_data_drift_train) %>%
  features(.resid, ljung_box, lag = 24, dof = 0)
## # A tibble: 1 × 3
##   .model              lb_stat lb_pvalue
##   <chr>                 <dbl>     <dbl>
## 1 RW(Price ~ drift())    314.         0

I believe the Drift method to be the best method of the one’s I used. Drift has the lowest RMSE and MPE of the four I tested. This makes it the best forecast method for me to use. Naive is a close second to the drift method, but its MPE being higher than the Drift method makes me not choose it. The residuals for the Drift and the Naive also look to be white noice and normally distributed. Both suffer from autocorrelation up to 5 lags.

Question 5

gas_stretch <- gas_data %>%
  stretch_tsibble(.init = 5, .step = 1) 

gas_naive_cv <- gas_stretch %>%
  model(NAIVE(Price)) %>%
  forecast(h = "2 weeks") %>%
  accuracy(gas_data)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 2 observations are missing between 2022 W40 and 2022 W41
gas_mean_cv <- gas_stretch %>%
  model(MEAN(Price)) %>%
  forecast(h = "2 weeks") %>%
  accuracy(gas_data)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 2 observations are missing between 2022 W40 and 2022 W41
gas_drift_cv <- gas_stretch %>%
  model(RW(Price ~ drift())) %>%
  forecast(h = "2 weeks") %>%
  accuracy(gas_data)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 2 observations are missing between 2022 W40 and 2022 W41
gas_naive_cv
## # A tibble: 1 × 10
##   .model       .type      ME   RMSE    MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>        <chr>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NAIVE(Price) Test  0.00250 0.0806 0.0557 0.0435  1.85 0.118 0.131 0.761
gas_mean_cv
## # A tibble: 1 × 10
##   .model      .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>       <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MEAN(Price) Test  -0.107 0.620 0.493 -7.65  17.7  1.04  1.01 0.997
gas_drift_cv
## # A tibble: 1 × 10
##   .model            .type       ME   RMSE    MAE     MPE  MAPE  MASE RMSSE  ACF1
##   <chr>             <chr>    <dbl>  <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 RW(Price ~ drift… Test  -0.00110 0.0809 0.0559 -0.0510  1.86 0.118 0.131 0.760

After employing cross-validation I believe the Naive to be the best method. The accuracy measures all look more favorable for the Naive forecast than the drift. This was surprising to me after #4 because I was excepting the Drift to be the best. Because the NAIVE turned out to be better using cross-validation, this is the forecast I will choose.

Question 6

gas_naive_final <- gas_data %>%
  model(NAIVE(Price)) %>%
  forecast(h = "2 weeks") %>%
  hilo()
gas_naive_final
## # A tsibble: 2 x 6 [1W]
## # Key:       .model [1]
##   .model           Week          Price .mean                  `80%`
##   <chr>          <week>         <dist> <dbl>                 <hilo>
## 1 NAIVE(Price) 2022 W40 N(3.8, 0.0031)  3.83 [3.760658, 3.903342]80
## 2 NAIVE(Price) 2022 W41 N(3.8, 0.0062)  3.83 [3.731107, 3.932893]80
## # … with 1 more variable: `95%` <hilo>
## # ℹ Use `colnames()` to see all variable names