knitr::opts_chunk$set(echo = TRUE)

Assignment 1: Foundational Forecasting with Time Series Regression

https://rpubs.com/Farrell/1351933

The monthly data I selected is the Average Price: Eggs, Grade A, Large (Cost per Dozen) in U.S. City Average (APU0000708111) from FRED. The data is available here: https://fred.stlouisfed.org/series/APU0000708111

Set up the Data

#to clean the whole environment
rm(list = ls())

#libraries 
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble      3.3.0     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.2
## ✔ lubridate   1.9.4     ✔ fable       0.4.1
## ✔ ggplot2     3.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(fredr)
library(ggplot2)
library(patchwork)
library(tsibble)
fredr_set_key(Sys.getenv("FRED_API_KEY"))  #read the API key from an environment variable rather than hard-coding it in the published document

#Pull in the data
egg <- fredr("APU0000708111",
              observation_start = as.Date("1980-01-01"),
              observation_end   = as.Date("2025-08-01")
              ) |>
  mutate(Month = yearmonth(date)) |>
  as_tsibble(index = Month)

autoplot(egg, value)

#Setting up the data to have a test and train sample with 80%/20% 
split_index <- floor(0.8 * nrow(egg))  

train <- egg[1:split_index, ]
test  <- egg[(split_index + 1):nrow(egg), ]
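
As a quick sanity check (a sketch I added, not required by the assignment), the date ranges confirm the split is chronological, with the test sample covering the most recent months:

#confirm the 80/20 split is chronological
range(train$Month)
range(test$Month)
cat("train:", nrow(train), "months; test:", nrow(test), "months\n")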

Explore the Data

p_egg <- ggplot(train, aes(x = date, y = value)) +
  geom_line() +
  labs(title = "Train Egg Time Series",
       x = "Date", y = "Price") +
  theme_minimal()
print(p_egg)

Looking at the plotted training data, the fluctuations appear to grow over time. The price of a dozen eggs shows some seasonality along with an upward trend, and the seasonal swings are larger in recent years. There is also a large jump in prices following COVID-19.
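
To double-check the seasonal pattern suggested by the time plot, a seasonal plot can be drawn with feasts; this is a minimal sketch I added, not part of the original assignment output:

#seasonal plot: one line per year, months on the x-axis
gg_season(train, value) +
  labs(title = "Seasonal Plot of Train Egg Prices", y = "Price")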

Decomposition

library(feasts)
library(fabletools)

#Used multiplicative decomposition rather than additive because the swings become larger over time.

classical <- train |>
  model(classical = classical_decomposition(value, type = "multiplicative")) |>
  components()

p1<- autoplot(classical) +
  labs(title = "Classical Multiplicative Decomposition of Train Egg Prices")

stl_train <- train |>
  model(stl = STL(value ~ season(window = "periodic"))) |>
  components()

p2<- autoplot(stl_train) +
  labs(title = "STL Decomposition of Train Egg Prices")

library(patchwork)
(p1 / p2)
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).

Both decompositions show clear seasonality and a positive upward trend. The biggest difference is in the remainders: the STL decomposition distributes them fairly evenly up until 2020, while the classical decomposition shows some wider swings but is more consistent over time.
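
The remainder claim can be checked numerically; the sketch below (my addition) compares the spread of each decomposition's remainder term. Note the two are on different scales, since the classical multiplicative remainder is centred on 1 while the STL remainder is additive and centred on 0, so these indicate relative rather than directly comparable variability:

#spread of the remainder components (different scales, see note above)
sd(classical$random, na.rm = TRUE)
sd(stl_train$remainder, na.rm = TRUE)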

Transformation

library(feasts)
#note: the forecast package is not loaded here; guerrero() and box_cox()
#come from feasts/fabletools, and loading forecast can mask fable generics
lambda <- train |> features(value, features = guerrero) |> pull(lambda_guerrero)
lambda
## [1] -0.7594015
train |> autoplot(box_cox(value,lambda)) +
labs(y = "Box-Cox train transformed Value")
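
For reference, box_cox() with a non-zero lambda computes (value^lambda - 1) / lambda, so the strongly negative lambda here is an aggressive variance-stabilising transformation. A quick manual check of that formula (a sketch I added):

#manual Box-Cox for lambda != 0, compared against box_cox()
w_manual <- (train$value^lambda - 1) / lambda
all.equal(w_manual, as.numeric(box_cox(train$value, lambda)))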

#Alternatives considered (kept commented out):

#train |> autoplot(sqrt(value)) +
#  labs(y = "Square Root Value")
#The square-root transformation made little change to the plot.

#train |> autoplot(log(value)) +
#  labs(y = "Log Value")
#Log also made little difference.

#train |> autoplot(-1 / value) +
#  labs(y = "Inverse Value")
#Box-Cox did the best job of reducing the variance out of these
#transformations, so it is used instead.

#Box-Cox transform the test values (kept for reference; the models below
#are fit on the untransformed value)
test <- test |>
  mutate(value_bc = box_cox(value, lambda))
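
Since the models below are evaluated in dollars, it helps that the transformation is invertible; fabletools provides inv_box_cox() to map transformed values back to the original scale. A minimal round-trip check (my addition):

#back-transforming recovers the original prices
all.equal(inv_box_cox(test$value_bc, lambda), test$value)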

SNAIVE Model

library(fable)
library(fpp3)

#SNAIVE MODEL

models <- train |> model(SNAIVE = SNAIVE(value))

report(models|> select(SNAIVE))
## Series: value 
## Model: SNAIVE 
## 
## sigma^2: 0.0441
library(ggplot2)

# forecasts on the test data 
fc <- models |>
  forecast(h = nrow(test))

# Plot all forecasts with the training data
fc |> autoplot(train) +
  autolayer(test, value, color = "red") +
  labs(title = "SNAIVE Model Forecast vs Test",
       y = "Value",
       x = "Month") +
  facet_wrap(~ .model, ncol = 2)

acc <- accuracy(object = fc, 
                data = test
                ) |>
  select(.model, ME, MPE, RMSE, MAE, MAPE #, MASE, RMSSE
         )

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Print with knitr::kable
kable(acc, caption = "Forecast Accuracy Metrics") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Forecast Accuracy Metrics

.model    ME          MPE        RMSE      MAE        MAPE
SNAIVE    -0.1367909  -26.04032  1.228014  0.9934818  48.75403

The ME is -0.1368; because this value is negative, the model over-predicts on average, though only by a small amount. The MPE of -26% is the average percentage error, which keeps the sign of each error, while the MAPE of roughly 49% averages the absolute percentage errors, so the two are not interchangeable. The RMSE of 1.228 is the typical forecast error magnitude, and the MAE of 0.9935 is the average absolute deviation between forecasts and actual values. Given that egg prices mostly sit between $1 and $3 per dozen, percentage errors of this size suggest the fit is only rough.

The model does an okay job of tracking the test data overall. However, it did not account for the large jumps in prices in the most recent year, which are likely a result of Trump's tariffs and inflation.
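
One way to probe this further (a sketch I added, following the usual fpp3 workflow) is to inspect the SNAIVE residuals; if they were white noise, there would be little structure left to model:

#residual time plot, ACF, and histogram for the single SNAIVE model
models |> gg_tsresiduals()

#Ljung-Box test on the innovation residuals; a small p-value indicates
#remaining autocorrelation that SNAIVE has not captured
augment(models) |> features(.innov, ljung_box, lag = 24)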

ETS Model

#ETS MODEL

fit_egg <- train |>
  model(ETS(value))

report(fit_egg)
## Series: value 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.7713571 
##     beta  = 0.0002481812 
##     gamma = 0.0001001483 
## 
##   Initial states:
##       l[0]       b[0]     s[0]    s[-1]     s[-2]    s[-3]     s[-4]     s[-5]
##  0.8095612 0.00390488 1.059403 1.018373 0.9953986 1.012661 0.9910187 0.9408362
##     s[-6]     s[-7]    s[-8]   s[-9]   s[-10]   s[-11]
##  0.932668 0.9370898 1.006205 1.02147 1.030109 1.054768
## 
##   sigma^2:  0.0031
## 
##      AIC     AICc      BIC 
## 273.6956 275.1527 343.0933
components(fit_egg) |>
  autoplot() +
  labs(title = "ETS(M,N,A) components")
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

# forecasts on the test data 
fc2 <- fit_egg |>
  forecast(h = nrow(test))

# Plot all forecasts with the training data
fc2 |> autoplot(train) +
  labs(title = "ETS Model Forecast",
       y = "Egg Price",
       x = "Month") +
  facet_wrap(~ .model, ncol = 2)

fc2 |> 
  autoplot() +                            # plot forecast only
  autolayer(egg, value, color = "blue") +  # full historical data
  autolayer(test, value, color = "red") +   # test series
  labs(
    title = "ETS Forecast vs Test Data",
    y = "Egg Price",
    x = "Month"
  ) +
  facet_wrap(~ .model, ncol = 2)

acc2 <- accuracy(object = fc2, 
                data = test
                ) |>
  select(.model, ME, MPE, RMSE, MAE, MAPE #, MASE, RMSSE
         )

library(kableExtra)

# Print with knitr::kable
kable(acc2, caption = "Forecast Accuracy Metrics") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Forecast Accuracy Metrics

.model      ME        MPE       RMSE      MAE        MAPE
ETS(value)  0.383787  4.200715  1.056589  0.6747246  24.44062

The ME is 0.3838; because this value is positive, the model under-predicts on average, though only by a small amount. The MPE of 4.2% is the average percentage error, which keeps the sign of each error, while the MAPE of about 24% averages the absolute percentage errors. The RMSE of 1.0566 is the typical forecast error magnitude, and the MAE of 0.6747 is the average absolute deviation between forecasts and actual values; both are lower than the SNAIVE equivalents. While the SNAIVE model over-predicted more on average and this model under-predicts more on average, the plot makes clear that both were high before 2020 and low afterwards.

The model does an okay job of tracking the test data and has a lower MAE than the SNAIVE model. It also failed to anticipate the large jumps in prices in the most recent year, which are likely a result of Trump's tariffs and inflation, but it did a better job on the earlier portion of the test period, before COVID-19 hit.
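
To make the comparison explicit, the two accuracy tables can be stacked into one (my addition, reusing the objects computed above):

#side-by-side accuracy of SNAIVE and ETS on the same test set
bind_rows(acc, acc2) |>
  kable(caption = "SNAIVE vs ETS: Forecast Accuracy on the Test Set") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)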