library(dplyr)
library(ggplot2)
library(tidyr)
library(tibble)
library(tsibble)
library(ggfortify)
library(tidyverse)
library(fpp3)
library(moments)
library(zoo)
library(fable)
library(brew)
library(sos)
library(slider)
library(seasonal)
library(x13binary)
library(USgas)
gas_data <- readxl::read_excel("//Users//colinadams//Documents//GCSU//Fall 2022//Forecasting//Forecasts//Forecast 3//gas_prices.xlsx")
gas_data <- gas_data %>%
filter(year(Week) > 2009) %>%
mutate(Week = yearweek(Week)) %>%
as_tsibble(index = Week)
gas_data_train <- gas_data %>%
filter(year(Week) < 2020)
gas_data_test <- gas_data %>%
filter(year(Week) > 2019)
autoplot(gas_data) +
labs(title = "US Gas Price since 2010", y = "Gas Price in Dollars", x = "Week")
## Plot variable not specified, automatically selected `.vars = Price`
gas_data_log <- gas_data %>%
mutate(Price = log(Price))
autoplot(gas_data_log)
## Plot variable not specified, automatically selected `.vars = Price`
gas_data %>%
features(Price, features = guerrero)
## # A tibble: 1 × 1
## lambda_guerrero
## <dbl>
## 1 -0.342
gas_data %>%
autoplot(box_cox(Price, -0.342)) +
labs(title = "Box-Cox Transformation of US Gas Prices", y = "Gas Price in Dollars", x = "Week")
gas_dataadd <- gas_data %>%
model(classical_decomposition(Price, type = "additive")) %>%
components() %>%
autoplot() +
labs(title = "Classical Additive Decomposition of Gas Prices")
gas_dataadd
## Warning: Removed 26 row(s) containing missing values (geom_path).
gas_datamulti <- gas_data %>%
model(classical_decomposition(Price, type = "multiplicative")) %>%
components() %>%
autoplot() +
labs(title = "Classical Multiplicative Decomposition of Gas Prices")
gas_datamulti
## Warning: Removed 26 row(s) containing missing values (geom_path).
gas_datastlperiod <- gas_data %>%
model(STL(Price ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "Periodic STL Decomposition of Gas Prices")
gas_datastlperiod
gas_datadcmp <- gas_data %>%
model(stl = STL(Price))
gas_data %>%
autoplot(Price, color = "gray") +
autolayer(components(gas_datadcmp), trend, color = "red") +
labs(y = "Gas Price in Dollars", title = "Trend of Gas Prices Since 2010")
None of the transformations I attempted reduced the variance or changed the data in any significant way. Neither the log transformation nor the Box-Cox helped in understanding the gas data.
Using three different compositions I could see the breakdown of the trend, seasonal, and random components in the data. The trend looks similar in all decomposition. The price increases, plateaus, then decreases. This pattern repeats twice in the data. There is a clear one year season. In the data we can see the effects of the 2008 recession where gas prices increased for years after it. You can see a similar effect during COVID where gas prices increase. Between these recessions gas prices steadied around $2.50/mile
gas_data_zoom <- gas_data %>%
filter(year(Week) > 2021)
gas_fc_plot <- gas_data_train %>%
model(
Mean = MEAN(Price),
Naive = NAIVE(Price),
Seasonal_naive = SNAIVE(Price),
Drift = RW(Price ~ drift()))
gas_fc_plot %>%
forecast(h = 143) %>%
autoplot(gas_data, level = NULL) +
labs(title = "Forecasts of US Gas Price", y = "Gas Price in Dollars", x = "Week") +
guides(color = guide_legend(title = "Forecast"))
gas_data_naive_train <- gas_data_train %>%
model(NAIVE(Price))
gas_data_naive_test <- gas_data_test %>%
model(NAIVE(Price))
gas_data_naive_test %>%
forecast(h = "2 weeks") %>%
autoplot(gas_data_zoom) +
labs(title = "Naive Forecast of US Gas Price", y = "Gas Price in Dollars", x = "Week")
gas_data_mean_train <- gas_data_train %>%
model(MEAN(Price))
gas_data_mean_test <- gas_data_test %>%
model(MEAN(Price))
gas_data_mean_test %>%
forecast(h = "2 weeks") %>%
autoplot(gas_data_zoom) +
labs(title = "Mean Forecast of US Gas Price", y = "Gas Price in Dollars", x = "Week")
gas_data_drift_train <- gas_data_train %>%
model(RW(Price ~ drift()))
gas_data_drift_test <- gas_data_test %>%
model(RW(Price ~ drift()))
gas_data_drift_test %>%
forecast(h = "2 weeks") %>%
autoplot(gas_data_zoom)+
labs(title = "Drift Forecast of US Gas Price", y = "Gas Price in Dollars", x = "Week")
I used the naive, mean, and drift methods for forecasting. I chose these because they provide good short-run forecasts in this situation. Gas price change in a short window of two weeks are almost random without any seasonality. There is a trend that will affect my two week forecast. These reasons are why I believe the drift forecast to be the best. The drift takes into account the randomness of gas prices in the short-run while still following the trend. The mean forecast is not good in this case because gas prices from years ago should not have a bearing on my forecast for two weeks.
accuracy(gas_fc_plot)
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mean Traini… 8.37e-17 0.539 0.470 -3.35 16.2 1.28 1.13 0.995
## 2 Naive Traini… -1.87e- 4 0.0489 0.0372 -0.0212 1.27 0.102 0.102 0.559
## 3 Seasonal_naive Traini… -1.66e- 2 0.479 0.367 -2.00 12.9 1 1 0.991
## 4 Drift Traini… -1.90e-16 0.0489 0.0372 -0.0148 1.27 0.102 0.102 0.559
gg_tsresiduals(gas_data_naive_train) +
labs(title = "Naive Forecast Residuals Test")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
augment(gas_data_naive_train) %>%
features(.resid, ljung_box, lag = 24, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Price) 314. 0
gg_tsresiduals(gas_data_mean_train) +
labs(title = "Mean Forecast Residuals Test")
augment(gas_data_mean_train) %>%
features(.resid, ljung_box, lag = 24, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 MEAN(Price) 9138. 0
gg_tsresiduals(gas_data_drift_train) +
labs(title = "Drift Forecast Residuals Test")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
augment(gas_data_drift_train) %>%
features(.resid, ljung_box, lag = 24, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 RW(Price ~ drift()) 314. 0
I believe the Drift method to be the best method of the one’s I used. Drift has the lowest RMSE and MPE of the four I tested. This makes it the best forecast method for me to use. Naive is a close second to the drift method, but its MPE being higher than the Drift method makes me not choose it. The residuals for the Drift and the Naive also look to be white noice and normally distributed. Both suffer from autocorrelation up to 5 lags.
gas_stretch <- gas_data %>%
stretch_tsibble(.init = 5, .step = 1)
gas_naive_cv <- gas_stretch %>%
model(NAIVE(Price)) %>%
forecast(h = "2 weeks") %>%
accuracy(gas_data)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 2 observations are missing between 2022 W40 and 2022 W41
gas_mean_cv <- gas_stretch %>%
model(MEAN(Price)) %>%
forecast(h = "2 weeks") %>%
accuracy(gas_data)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 2 observations are missing between 2022 W40 and 2022 W41
gas_drift_cv <- gas_stretch %>%
model(RW(Price ~ drift())) %>%
forecast(h = "2 weeks") %>%
accuracy(gas_data)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 2 observations are missing between 2022 W40 and 2022 W41
gas_naive_cv
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NAIVE(Price) Test 0.00250 0.0806 0.0557 0.0435 1.85 0.118 0.131 0.761
gas_mean_cv
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MEAN(Price) Test -0.107 0.620 0.493 -7.65 17.7 1.04 1.01 0.997
gas_drift_cv
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 RW(Price ~ drift… Test -0.00110 0.0809 0.0559 -0.0510 1.86 0.118 0.131 0.760
After employing cross-validation I believe the Naive to be the best method. The accuracy measures all look more favorable for the Naive forecast than the drift. This was surprising to me after #4 because I was excepting the Drift to be the best. Because the NAIVE turned out to be better using cross-validation, this is the forecast I will choose.
gas_naive_final <- gas_data %>%
model(NAIVE(Price)) %>%
forecast(h = "2 weeks") %>%
hilo()
gas_naive_final
## # A tsibble: 2 x 6 [1W]
## # Key: .model [1]
## .model Week Price .mean `80%`
## <chr> <week> <dist> <dbl> <hilo>
## 1 NAIVE(Price) 2022 W40 N(3.8, 0.0031) 3.83 [3.760658, 3.903342]80
## 2 NAIVE(Price) 2022 W41 N(3.8, 0.0062) 3.83 [3.731107, 3.932893]80
## # … with 1 more variable: `95%` <hilo>
## # ℹ Use `colnames()` to see all variable names