For this assignment I used data on crude oil prices. Specifically, I looked at crude oil domestic first purchase prices, measured in dollars per barrel. I used data from 2000 to 2004 to train my models and then tested my model forecasts on 2005 data.

Libraries

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stats)
library(smooth)
## Loading required package: greybox
## Package "greybox", v1.0.8 loaded.
## If you want to know more about the greybox and forecasting, you can visit my website: https://forecasting.svetunkov.ru/
## This is package "smooth", v3.2.1
## If you want to know more about the smooth package and forecasting, you can visit my website: https://forecasting.svetunkov.ru/
library(fpp3)
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ tibble      3.2.1     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.3.1
## ✔ lubridate   1.9.2     ✔ fable       0.3.3
## ✔ tsibble     1.1.3     ✔ fabletools  0.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ lubridate::hm()      masks greybox::hm()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ fabletools::MAE()    masks greybox::MAE()
## ✖ fabletools::MAPE()   masks greybox::MAPE()
## ✖ fabletools::MASE()   masks greybox::MASE()
## ✖ fabletools::ME()     masks greybox::ME()
## ✖ tsibble::measures()  masks greybox::measures()
## ✖ fabletools::MPE()    masks greybox::MPE()
## ✖ fabletools::MSE()    masks greybox::MSE()
## ✖ fabletools::RMSSE()  masks greybox::RMSSE()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tidyr::spread()      masks greybox::spread()
## ✖ tsibble::union()     masks base::union()
library(rstudioapi)
library(stats)
library(seasonal)
## 
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
## 
##     view
## The following object is masked from 'package:greybox':
## 
##     qs

Importing Data

setwd("~/Documents/MSAE/Predictive Analytics")
crudeoil_data <- read_excel("Crude Oil Data.xlsx")

Data Editing

crudeoil_data<- crudeoil_data %>% 
  mutate(Month = yearmonth(Month)) %>% 
  tsibble(index = Month)

crudeoil_data$Price <- as.numeric(crudeoil_data$Price)
## Warning: NAs introduced by coercion

Training Set (2000-2004) & Test Set (2005)

training <- crudeoil_data %>% 
  filter(year(Month) < 2005 & year(Month) > 1999 )

test <- crudeoil_data %>% 
  filter(year(Month) == 2005)

crudeoil_data %>% autoplot(Price)
## Warning: Removed 1 row containing missing values (`geom_line()`).

Seasonal Plot

There does not appear to be any significant seasonality in crude oil prices. In 2000, 2002, and 2004 prices were higher in the latter half of the year, peaking in October and November. However, there was a significant decline in prices over the course of 2001 and prices remained relatively consistent in 2003.

training %>%  gg_season(Price)

Models

Naive Model

NAIVE <- training %>% model(NAIVE(Price))
report(NAIVE)
## Series: Price 
## Model: NAIVE 
## 
## sigma^2: 4.746
NAIVE_fcst <- NAIVE %>% forecast(test)

NAIVE_plot <- NAIVE %>% forecast(test) %>% autoplot(test)
NAIVE_plot

Naive_metrics <- accuracy(NAIVE_fcst, test)

Drift Model

Drift <- training %>% model(RW(Price ~ drift()))
report(Drift)
## Series: Price 
## Model: RW w/ drift 
## 
## Drift: 0.249 (se: 0.2836)
## sigma^2: 4.746
Drift_fcst <- Drift %>% forecast(test)
Drift_plot <- Drift %>% forecast(test) %>% autoplot(test)
Drift_plot

Drift_metrics <- accuracy(Drift_fcst, test)

SNAIVE Model

SNAIVE <- training %>% model(SNAIVE(Price ~ lag("year")))
report(SNAIVE)
## Series: Price 
## Model: SNAIVE 
## 
## sigma^2: 54.4972
SNAIVE_fcst <- SNAIVE %>% forecast(test)

SNAIVE_plot <- SNAIVE %>% forecast(test) %>% autoplot(test)
SNAIVE_plot

SNAIVE_metrics <- accuracy(SNAIVE_fcst, test)

ETS Model

ets_auto <- training %>% model(ETS(Price))
ets_auto_fcst <- ets_auto %>% forecast(test)
report(ets_auto)
## Series: Price 
## Model: ETS(M,N,N) 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##      l[0]
##  23.39133
## 
##   sigma^2:  0.0062
## 
##      AIC     AICc      BIC 
## 337.0430 337.4716 343.3261
ets_auto_metrics <- accuracy(ets_auto_fcst, test)

# ETS auto function produces ANN model, which is the same thing as NAIVE model. 

#MAN - ETS MODEL

ETS_MAN <- training %>% 
  model(ETS(Price ~ error("M") + trend("A") + season("N")))

report(ETS_MAN)
## Series: Price 
## Model: ETS(M,A,N) 
##   Smoothing parameters:
##     alpha = 0.9998999 
##     beta  = 0.0001000248 
## 
##   Initial states:
##      l[0]     b[0]
##  23.27994 0.327565
## 
##   sigma^2:  0.0061
## 
##      AIC     AICc      BIC 
## 339.3156 340.4267 349.7873
ETS_MAN_fcst <- ETS_MAN %>% forecast(test)
ETS_MAN_plot <- ETS_MAN %>% forecast(test) %>% autoplot(test)
ETS_MAN_plot

ETS_MAN_metrics <- accuracy(ETS_MAN_fcst, test)

#MAA - ETS MODEL

ETS_MAA <- training %>% 
  model(ETS(Price ~ error("M") + trend("A") + season("A")))

report(ETS_MAA)
## Series: Price 
## Model: ETS(M,A,A) 
##   Smoothing parameters:
##     alpha = 0.9942258 
##     beta  = 0.000147891 
##     gamma = 0.0002278658 
## 
##   Initial states:
##      l[0]      b[0]      s[0]      s[-1]     s[-2]    s[-3]    s[-4]     s[-5]
##  27.85742 0.1770931 -1.489105 -0.5931149 0.6130331 1.133537 1.048713 0.4714473
##        s[-6]     s[-7]     s[-8]     s[-9]     s[-10]    s[-11]
##  -0.06405145 0.2910071 -0.572444 0.2784932 0.01670622 -1.134222
## 
##   sigma^2:  0.0072
## 
##      AIC     AICc      BIC 
## 357.7799 372.3513 393.3837
ETS_MAA_fcst <- ETS_MAA %>% forecast(test)
ETS_MAA_plot <- ETS_MAA %>% forecast(test) %>% autoplot(test)
ETS_MAA_plot

ETS_MAA_metrics <- accuracy(ETS_MAA_fcst, test)

Model Forecasts Plotted

fit <- training %>% 
  model('Naive' = NAIVE(Price),
        'Seasonal Naive' = SNAIVE(Price),
        'Drift' = RW(Price ~ drift()),
        'ETS MAN' = ETS(Price ~ error("M") + trend("A") + season("N")),
        'ETS MAA' = ETS(Price ~ error("M") + trend("A") + season("A"))
                     )

fcst <- fit %>% forecast(test)

fcst %>% 
  autoplot(training, level = NULL) +
  autolayer(test)
## Plot variable not specified, automatically selected `.vars = Price`

Accuracy Metrics

accuracy_metrics = rbind(ETS_MAN_metrics, ETS_MAA_metrics, SNAIVE_metrics, Naive_metrics, Drift_metrics)
accuracy_metrics <- accuracy_metrics %>% arrange(RMSE)
accuracy_metrics
## # A tibble: 5 × 10
##   .model                   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "ETS(Price ~ error(\"M\… Test   9.65  11.0  9.65  18.1  18.1   NaN   NaN 0.699
## 2 "ETS(Price ~ error(\"M\… Test  10.2   11.5 10.2   19.2  19.2   NaN   NaN 0.688
## 3 "RW(Price ~ drift())"    Test  10.7   12.0 10.7   20.1  20.1   NaN   NaN 0.697
## 4 "NAIVE(Price)"           Test  12.3   13.8 12.3   23.2  23.2   NaN   NaN 0.719
## 5 "SNAIVE(Price ~ lag(\"y… Test  13.7   14.1 13.7   26.9  26.9   NaN   NaN 0.253

Discussion

The ETS model is the strongest model, producing the lowest value across all accuracy metrics (ME, RMSE, MAE, MPE, and MAPE). Since visually it did not look like there was strong seasonality, I did look at two ETS models - MAA with additive seasonality, and MAN without seasonality. Surprisingly, the MAA ETS actually produces a better fit forecast and had a lower AIC then the MAN ETS model. The drift model was very similar to the ETS with only the trend component, but ultimately was a poorer fit. The Naive model and Seasonal Naive model did not perform very well as they both ignored the strong positive trend in the data. For additional analysis, I think a larger training data set would be helpful to determine the true seasonality of this data since outliers in such a small time period could be skewing the forecast.