Homework 1

Author

Robert Jenkins

Setup

library(fpp3)
library(feasts)
library(fredr)
library(dplyr)
library(tseries)
library(fabletools)
fredr_has_key()

[1] TRUE

FRED Data

# Producer Price Index by Industry: Total Manufacturing Industries (PCUOMFGOMFG)

PPI <- fredr(series_id = "PCUOMFGOMFG") |> #Data comes in as a tibble object, date in YYYY-MM-DD, Value, realtime_start, & realtime_end -> convert the date to month-year
  transmute(month = yearmonth(date),value) |>
  as_tsibble(index = month) |>
  filter(month >= yearmonth("1986 Jan")) #There was one observation in DEC 1984 but all of 1985 is missing.

PPI #480x2 Tsibble() which makes sense because observations are 1986 to 2025 (12 Per Year)

# A tsibble: 480 x 2 [1M]
      month value
      <mth> <dbl>
 1 1986 Jan 101. 
 2 1986 Feb  99.9
 3 1986 Mar  98.3
 4 1986 Apr  97.9
 5 1986 May  98.2
 6 1986 Jun  98.2
 7 1986 Jul  97.5
 8 1986 Aug  97.4
 9 1986 Sep  97.6
10 1986 Oct  98.2
# ℹ 470 more rows

autoplot(PPI) +
  labs(
    title = "Producer Price Index: Total Manufacturing Industries",
    x = "Month",
    y = "Index 1986 = 101.3"
  ) +
  theme_bw()

Split the Data Into Train and Test

# Data Goes From 1986(Monthly) - 480 Observations - 80% = 384 - Will Leave out the Last 96 Months - 20% - as Test
split <- yearmonth(max(PPI$month)) - 96
train <- PPI |> filter(month <= split)
test <- PPI |> filter(month > split)
nrow(PPI) #check total observations - confirmed 480

[1] 480

nrow(train) #check observations on train - confirmed 384

[1] 384

nrow(test) #check observations on test - confirmed 96

[1] 96

bind_rows(train |> mutate(Set = "Train"), test  |> mutate(Set = "Test")) |>
  ggplot(aes(x = month, y = value, color = Set)) +
  geom_line() +
  labs(
    title = "Producer Price Index: Train vs Test Split (80/20)",
    x = "Month",
    y = "PPI (Index)"
  ) +
  theme_bw()

Split the Data Into Train and Test

# Plot the Training Data
autoplot(train, value) +
  labs(title = "PPI (Training Set)", x = "Month", y = "PPI (Index)") +
  theme_bw()

# Classical Decomposition
train |>
  model(classical = classical_decomposition(value, type = "additive")) |>
  components() |>
  autoplot() +
  labs(title = "Classical Decomposition (Additive) - Training Set") +
  theme_bw()

# STL Decomposition
train |>
  model(stl = STL(value)) |>
  components() |>
  autoplot() +
  labs(title = "STL Decomposition - Training Set") +
  theme_bw()

Diagnostics

p1 <- autoplot(train, value) +
  labs(title = "Training Set (Level)", x = "Month", y = "PPI (Index)") +
  theme_bw()

p2 <- train %>%
  mutate(log_value = log(value)) %>%
  autoplot(log_value) +
  labs(title = "Training Set (Log Scale)", x = "Month", y = "log(PPI)") +
  theme_bw()

library(patchwork)
p1 / p2

x <- train$value

# ADF: null = unit root (non-stationary)
adf_res <- adf.test(x)

# KPSS: null = stationary
kpss_res <- kpss.test(x)

adf_res


    Augmented Dickey-Fuller Test

data:  x
Dickey-Fuller = -1.8654, Lag order = 7, p-value = 0.634
alternative hypothesis: stationary

kpss_res


    KPSS Test for Level Stationarity

data:  x
KPSS Level = 6.2297, Truncation lag parameter = 5, p-value = 0.01

Build Models

# Build the models using the train data set.
fits <- train |>
  model(
    baseline_drift = RW(value ~ drift()),
    reg_trend_season = TSLM(value ~ trend() + season())
  )

# Regression Model Report & Coefficient
fits |> select(reg_trend_season) |> report()

Series: value 
Model: TSLM 

Residuals:
     Min       1Q   Median       3Q      Max 
-14.0865  -6.6162   0.1536   4.2487  19.3855 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    92.661134   1.521336  60.908   <2e-16 ***
trend()         0.267654   0.003563  75.114   <2e-16 ***
season()year2   0.079221   1.934146   0.041    0.967    
season()year3   0.489691   1.934156   0.253    0.800    
season()year4   0.915787   1.934172   0.473    0.636    
season()year5   1.285633   1.934195   0.665    0.507    
season()year6   1.183603   1.934225   0.612    0.541    
season()year7   1.019074   1.934261   0.527    0.599    
season()year8   0.863919   1.934303   0.447    0.655    
season()year9   0.796265   1.934353   0.412    0.681    
season()year10  0.756736   1.934408   0.391    0.696    
season()year11  0.107831   1.934471   0.056    0.956    
season()year12 -0.687948   1.934540  -0.356    0.722    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.737 on 371 degrees of freedom
Multiple R-squared: 0.9384, Adjusted R-squared: 0.9364
F-statistic: 470.7 on 12 and 371 DF, p-value: < 2.22e-16

fits |> tidy() |> filter(.model == "reg_trend_season")

# A tibble: 13 × 6
   .model           term           estimate std.error statistic   p.value
   <chr>            <chr>             <dbl>     <dbl>     <dbl>     <dbl>
 1 reg_trend_season (Intercept)     92.7      1.52      60.9    2.91e-195
 2 reg_trend_season trend()          0.268    0.00356   75.1    1.69e-226
 3 reg_trend_season season()year2    0.0792   1.93       0.0410 9.67e-  1
 4 reg_trend_season season()year3    0.490    1.93       0.253  8.00e-  1
 5 reg_trend_season season()year4    0.916    1.93       0.473  6.36e-  1
 6 reg_trend_season season()year5    1.29     1.93       0.665  5.07e-  1
 7 reg_trend_season season()year6    1.18     1.93       0.612  5.41e-  1
 8 reg_trend_season season()year7    1.02     1.93       0.527  5.99e-  1
 9 reg_trend_season season()year8    0.864    1.93       0.447  6.55e-  1
10 reg_trend_season season()year9    0.796    1.93       0.412  6.81e-  1
11 reg_trend_season season()year10   0.757    1.93       0.391  6.96e-  1
12 reg_trend_season season()year11   0.108    1.93       0.0557 9.56e-  1
13 reg_trend_season season()year12  -0.688    1.93      -0.356  7.22e-  1

# Drift Model Parameter
fits |> tidy() |> filter(.model == "baseline_drift")

# A tibble: 1 × 6
  .model         term  estimate std.error statistic  p.value
  <chr>          <chr>    <dbl>     <dbl>     <dbl>    <dbl>
1 baseline_drift b        0.235    0.0622      3.78 0.000185

Forecasts and Accuracy

#create the forecast fable
test_forecast <- fits |> forecast(new_data = test)
test_forecast

# A fable: 192 x 4 [1M]
# Key:     .model [2]
   .model            month
   <chr>             <mth>
 1 baseline_drift 2018 Jan
 2 baseline_drift 2018 Feb
 3 baseline_drift 2018 Mar
 4 baseline_drift 2018 Apr
 5 baseline_drift 2018 May
 6 baseline_drift 2018 Jun
 7 baseline_drift 2018 Jul
 8 baseline_drift 2018 Aug
 9 baseline_drift 2018 Sep
10 baseline_drift 2018 Oct
# ℹ 182 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

#test for accuracy on RMSE, MAE, and MAPE
acc <- test_forecast |>
  accuracy(test) |>
  select(.model, RMSE, MAE, MAPE)
acc

# A tibble: 2 × 4
  .model            RMSE   MAE  MAPE
  <chr>            <dbl> <dbl> <dbl>
1 baseline_drift    30.0  23.8  9.77
2 reg_trend_season  25.1  20.4  8.47

#Remove Distribution Column that Creates Ribbons and Messy Charts
test_fc_mean <- test_forecast |>
  as_tibble() |>
  select(month, .model, .mean) |>
  as_tsibble(index = month, key = .model)

#Plot the Train, Test Actual (Dashed) and the Drift and Regression Forecasts
autoplot(train, value) +
  autolayer(test, value, linetype = "dashed") +
  autolayer(test_fc_mean, .mean) +
  labs(
    title = "Test-Set Forecast Comparison: Drift vs Trend+Season Regression",
    x = "Month",
    y = "PPI (Index)"
  ) +
  theme_bw()

#Plot Only Test Setr and the Drift and Regression Forecasts to see easier
autoplot(test, value) +
  autolayer(test_fc_mean, .mean) +
  labs(
    title = "Forecasts vs Actuals (Test Period Only)",
    x = "Month",
    y = "PPI (Index)"
  ) +
  theme_bw()