Module 10 Discussion

Author

Robert Jenkins

Setup

library(fpp3)
library(fredr)
library(dplyr)
library(tseries)
library(fabletools)
library(ggplot2)
library(readr)
library(tidyverse)
library(tsibble)
library(feasts)
library(scales)
library(patchwork)
library(ggtime)
library(tseries)
library(readxl)
library(lubridate)
library(forecast)
library(tidyquant)
library(tidyverse)
library(quantmod)
library(lubridate)
fredr_has_key()
[1] TRUE

Option 2 - Tiny Mutual Fund Example

# I picked four large companies from two sectors.
# This gives me a simple portfolio structure:
# Total portfolio -> Sector -> Individual stock

stocks <- c("AAPL", "MSFT", "XOM", "CVX")

stock_info <- tibble(
  symbol = c("AAPL", "MSFT", "XOM", "CVX"),
  sector = c("Technology", "Technology", "Energy", "Energy"),
  company = c("Apple", "Microsoft", "Exxon Mobil", "Chevron")
)

stock_info
# A tibble: 4 × 3
  symbol sector     company    
  <chr>  <chr>      <chr>      
1 AAPL   Technology Apple      
2 MSFT   Technology Microsoft  
3 XOM    Energy     Exxon Mobil
4 CVX    Energy     Chevron    
## Pull daily adjusted closing prices

# I used Yahoo Finance data from 2020 through 2024.
# The first four years will be training data and 2024 will be the test year.

getSymbols(c("AAPL", "MSFT", "XOM", "CVX"),
           src = "yahoo",
           from = "2020-01-01",
           to = "2024-12-31")
[1] "AAPL" "MSFT" "XOM"  "CVX" 
stock_prices <- data.frame(
  date = index(AAPL),
  AAPL = as.numeric(Ad(AAPL)),
  MSFT = as.numeric(Ad(MSFT)),
  XOM  = as.numeric(Ad(XOM)),
  CVX  = as.numeric(Ad(CVX))
)

head(stock_prices)
        date     AAPL     MSFT      XOM      CVX
1 2020-01-02 72.40050 152.1584 53.30640 92.01773
2 2020-01-03 71.69663 150.2637 52.87785 91.69946
3 2020-01-06 72.26793 150.6522 53.28386 91.38874
4 2020-01-07 71.92805 149.2785 52.84779 90.22180
5 2020-01-08 73.08509 151.6563 52.05081 89.19118
6 2020-01-09 74.63750 153.5509 52.44930 89.04722

Convert daily prices to monthly prices

# The assignment calls for monthly adjusted closing data.
# I kept the last adjusted closing price from each month.

monthly_prices <- stock_prices |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month) |>
  summarise(AAPL = last(AAPL),MSFT = last(MSFT),XOM  = last(XOM),CVX  = last(CVX))

head(monthly_prices)
# A tibble: 6 × 5
  month       AAPL  MSFT   XOM   CVX
  <date>     <dbl> <dbl> <dbl> <dbl>
1 2020-01-01  74.6  161.  46.7  81.2
2 2020-02-01  66.1  154.  39.2  71.6
3 2020-03-01  61.4  150.  29.0  55.6
4 2020-04-01  71.0  170.  35.4  70.5
5 2020-05-01  77.0  175.  35.3  71.3
6 2020-06-01  88.4  194.  34.8  69.4
## Create equal-dollar portfolio values

# I treated this like I invested $1,000 in each stock at the beginning.
# Each stock starts at 1,000 and then moves based on its adjusted closing price.

portfolio_values <- monthly_prices |>
  mutate(AAPL_value = 1000 * AAPL / first(AAPL),MSFT_value = 1000 * MSFT / first(MSFT),XOM_value  = 1000 * XOM  / first(XOM),CVX_value  = 1000 * CVX  / first(CVX)) |>
  select(month, AAPL_value, MSFT_value, XOM_value, CVX_value)

head(portfolio_values)
# A tibble: 6 × 5
  month      AAPL_value MSFT_value XOM_value CVX_value
  <date>          <dbl>      <dbl>     <dbl>     <dbl>
1 2020-01-01      1000       1000      1000      1000 
2 2020-02-01       885.       954.      840.      881.
3 2020-03-01       824.       929.      620.      684.
4 2020-04-01       951.      1056.      759.      869.
5 2020-05-01      1032.      1082.      757.      879.
6 2020-06-01      1185.      1202.      744.      855.

Build the portfolio totals

# Technology is Apple plus Microsoft.
# Energy is Exxon Mobil plus Chevron.
# Total is the full four-stock portfolio.

portfolio_summary <- portfolio_values |>
  mutate(Technology = AAPL_value + MSFT_value,Energy = XOM_value + CVX_value,Total = Technology + Energy)

head(portfolio_summary)
# A tibble: 6 × 8
  month      AAPL_value MSFT_value XOM_value CVX_value Technology Energy Total
  <date>          <dbl>      <dbl>     <dbl>     <dbl>      <dbl>  <dbl> <dbl>
1 2020-01-01      1000       1000      1000      1000       2000   2000  4000 
2 2020-02-01       885.       954.      840.      881.      1840.  1721. 3561.
3 2020-03-01       824.       929.      620.      684.      1753.  1304. 3057.
4 2020-04-01       951.      1056.      759.      869.      2007.  1628. 3635.
5 2020-05-01      1032.      1082.      757.      879.      2115.  1635. 3750.
6 2020-06-01      1185.      1202.      744.      855.      2387.  1599. 3986.

Put the data into basic time series format

# This version is useful for viewing the individual series and plotting the base ARIMA forecasts.

ts_data <- portfolio_summary |>
  select(month, AAPL_value, MSFT_value, XOM_value, CVX_value, Technology, Energy, Total) |>
  pivot_longer(-month, names_to = "series", values_to = "value") |>
  mutate(month = yearmonth(month)) |>
  as_tsibble(key = series, index = month)

ts_data
# A tsibble: 420 x 3 [1M]
# Key:       series [7]
      month series     value
      <mth> <chr>      <dbl>
 1 2020 Jan AAPL_value 1000 
 2 2020 Feb AAPL_value  885.
 3 2020 Mar AAPL_value  824.
 4 2020 Apr AAPL_value  951.
 5 2020 May AAPL_value 1032.
 6 2020 Jun AAPL_value 1185.
 7 2020 Jul AAPL_value 1380.
 8 2020 Aug AAPL_value 1679.
 9 2020 Sep AAPL_value 1507.
10 2020 Oct AAPL_value 1417.
# ℹ 410 more rows

Train-test split

# Training period: 2020 through 2023
# Test period: 2024

train_data <- ts_data |>
  filter(month <= yearmonth("2023 Dec"))

test_data <- ts_data |>
  filter(month >= yearmonth("2024 Jan"))

train_data
# A tsibble: 336 x 3 [1M]
# Key:       series [7]
      month series     value
      <mth> <chr>      <dbl>
 1 2020 Jan AAPL_value 1000 
 2 2020 Feb AAPL_value  885.
 3 2020 Mar AAPL_value  824.
 4 2020 Apr AAPL_value  951.
 5 2020 May AAPL_value 1032.
 6 2020 Jun AAPL_value 1185.
 7 2020 Jul AAPL_value 1380.
 8 2020 Aug AAPL_value 1679.
 9 2020 Sep AAPL_value 1507.
10 2020 Oct AAPL_value 1417.
# ℹ 326 more rows
test_data
# A tsibble: 84 x 3 [1M]
# Key:       series [7]
      month series     value
      <mth> <chr>      <dbl>
 1 2024 Jan AAPL_value 2446.
 2 2024 Feb AAPL_value 2401.
 3 2024 Mar AAPL_value 2278.
 4 2024 Apr AAPL_value 2262.
 5 2024 May AAPL_value 2557.
 6 2024 Jun AAPL_value 2801.
 7 2024 Jul AAPL_value 2954.
 8 2024 Aug AAPL_value 3049.
 9 2024 Sep AAPL_value 3103.
10 2024 Oct AAPL_value 3008.
# ℹ 74 more rows

Fit base ARIMA models

# This fits an ARIMA model to each series before reconciliation.

fit <- train_data |>
  model(arima = ARIMA(value))

fit
# A mable: 7 x 2
# Key:     series [7]
  series                                  arima
  <chr>                                 <model>
1 AAPL_value                     <ARIMA(0,1,0)>
2 CVX_value                      <ARIMA(0,1,1)>
3 Energy     <ARIMA(0,1,0)(0,0,1)[12] w/ drift>
4 MSFT_value            <ARIMA(0,1,0) w/ drift>
5 Technology            <ARIMA(0,1,0) w/ drift>
6 Total                 <ARIMA(0,1,1) w/ drift>
7 XOM_value  <ARIMA(0,1,0)(0,0,1)[12] w/ drift>
fc_base <- fit |>
  forecast(h = 12)

fc_base
# A fable: 84 x 5 [1M]
# Key:     series, .model [7]
   series     .model    month
   <chr>      <chr>     <mth>
 1 AAPL_value arima  2024 Jan
 2 AAPL_value arima  2024 Feb
 3 AAPL_value arima  2024 Mar
 4 AAPL_value arima  2024 Apr
 5 AAPL_value arima  2024 May
 6 AAPL_value arima  2024 Jun
 7 AAPL_value arima  2024 Jul
 8 AAPL_value arima  2024 Aug
 9 AAPL_value arima  2024 Sep
10 AAPL_value arima  2024 Oct
# ℹ 74 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

Plot base forecasts against actual test data

# The black line is the actual 2024 data.
# This plot is mainly to show the unreconciled ARIMA forecasts before applying HTS methods.

fc_base |>
  autoplot(train_data) +
  autolayer(test_data, value, color = "black") +
  labs(title = "Base ARIMA Forecasts vs Actual Test Data",y = "Portfolio Value",x = "Month")

Build the hierarchical structure

# For reconciliation, I need the bottom-level data first.
# The structure is:
# Total portfolio
# Technology
# AAPL, MSFT
# Energy
# XOM, CVX

bottom_data <- portfolio_values |>
  select(month, AAPL_value, MSFT_value, XOM_value, CVX_value) |>
  pivot_longer(-month, names_to = "stock", values_to = "value") |>
  mutate(
    month = yearmonth(month),
    sector = case_when(
      stock %in% c("AAPL_value", "MSFT_value") ~ "Technology",
      stock %in% c("XOM_value", "CVX_value") ~ "Energy"
    )
  ) |>
  as_tsibble(key = c(sector, stock), index = month)

bottom_data
# A tsibble: 240 x 4 [1M]
# Key:       sector, stock [4]
      month stock     value sector
      <mth> <chr>     <dbl> <chr> 
 1 2020 Jan CVX_value 1000  Energy
 2 2020 Feb CVX_value  881. Energy
 3 2020 Mar CVX_value  684. Energy
 4 2020 Apr CVX_value  869. Energy
 5 2020 May CVX_value  879. Energy
 6 2020 Jun CVX_value  855. Energy
 7 2020 Jul CVX_value  804. Energy
 8 2020 Aug CVX_value  816. Energy
 9 2020 Sep CVX_value  700. Energy
10 2020 Oct CVX_value  676. Energy
# ℹ 230 more rows

Aggregate the hierarchy

# aggregate_key() creates the sector totals and overall portfolio total from the bottom-level stock data.

hier_data <- bottom_data |>
  aggregate_key(sector / stock, value = sum(value))

hier_data
# A tsibble: 420 x 4 [1M]
# Key:       sector, stock [7]
      month sector       stock        value
      <mth> <chr*>       <chr*>       <dbl>
 1 2020 Jan <aggregated> <aggregated> 4000 
 2 2020 Feb <aggregated> <aggregated> 3561.
 3 2020 Mar <aggregated> <aggregated> 3057.
 4 2020 Apr <aggregated> <aggregated> 3635.
 5 2020 May <aggregated> <aggregated> 3750.
 6 2020 Jun <aggregated> <aggregated> 3986.
 7 2020 Jul <aggregated> <aggregated> 4096.
 8 2020 Aug <aggregated> <aggregated> 4508.
 9 2020 Sep <aggregated> <aggregated> 4035.
10 2020 Oct <aggregated> <aggregated> 3845.
# ℹ 410 more rows
## Split the hierarchical data

hier_train <- hier_data |>
  filter(month <= yearmonth("2023 Dec"))

hier_test <- hier_data |>
  filter(month >= yearmonth("2024 Jan"))

hier_train
# A tsibble: 336 x 4 [1M]
# Key:       sector, stock [7]
      month sector       stock        value
      <mth> <chr*>       <chr*>       <dbl>
 1 2020 Jan <aggregated> <aggregated> 4000 
 2 2020 Feb <aggregated> <aggregated> 3561.
 3 2020 Mar <aggregated> <aggregated> 3057.
 4 2020 Apr <aggregated> <aggregated> 3635.
 5 2020 May <aggregated> <aggregated> 3750.
 6 2020 Jun <aggregated> <aggregated> 3986.
 7 2020 Jul <aggregated> <aggregated> 4096.
 8 2020 Aug <aggregated> <aggregated> 4508.
 9 2020 Sep <aggregated> <aggregated> 4035.
10 2020 Oct <aggregated> <aggregated> 3845.
# ℹ 326 more rows
hier_test
# A tsibble: 84 x 4 [1M]
# Key:       sector, stock [7]
      month sector       stock        value
      <mth> <chr*>       <chr*>       <dbl>
 1 2024 Jan <aggregated> <aggregated> 8557.
 2 2024 Feb <aggregated> <aggregated> 8737.
 3 2024 Mar <aggregated> <aggregated> 8957.
 4 2024 Apr <aggregated> <aggregated> 8830.
 5 2024 May <aggregated> <aggregated> 9316.
 6 2024 Jun <aggregated> <aggregated> 9645.
 7 2024 Jul <aggregated> <aggregated> 9739.
 8 2024 Aug <aggregated> <aggregated> 9713.
 9 2024 Sep <aggregated> <aggregated> 9825.
10 2024 Oct <aggregated> <aggregated> 9593.
# ℹ 74 more rows

Fit ARIMA models across the hierarchy

# This fits models at every level: total, sector, and individual stock.

hier_fit <- hier_train |>
  model(arima = ARIMA(value))

hier_fit
# A mable: 7 x 3
# Key:     sector, stock [7]
  sector       stock                                     arima
  <chr*>       <chr*>                                  <model>
1 Energy       CVX_value                        <ARIMA(0,1,1)>
2 Energy       XOM_value    <ARIMA(0,1,0)(0,0,1)[12] w/ drift>
3 Energy       <aggregated> <ARIMA(0,1,0)(0,0,1)[12] w/ drift>
4 Technology   AAPL_value                       <ARIMA(0,1,0)>
5 Technology   MSFT_value              <ARIMA(0,1,0) w/ drift>
6 Technology   <aggregated>            <ARIMA(0,1,0) w/ drift>
7 <aggregated> <aggregated>            <ARIMA(0,1,1) w/ drift>
hier_fc_base <- hier_fit |>
  forecast(h = 12)

hier_fc_base
# A fable: 84 x 6 [1M]
# Key:     sector, stock, .model [7]
   sector stock     .model    month
   <chr*> <chr*>    <chr>     <mth>
 1 Energy CVX_value arima  2024 Jan
 2 Energy CVX_value arima  2024 Feb
 3 Energy CVX_value arima  2024 Mar
 4 Energy CVX_value arima  2024 Apr
 5 Energy CVX_value arima  2024 May
 6 Energy CVX_value arima  2024 Jun
 7 Energy CVX_value arima  2024 Jul
 8 Energy CVX_value arima  2024 Aug
 9 Energy CVX_value arima  2024 Sep
10 Energy CVX_value arima  2024 Oct
# ℹ 74 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

Bottom-up reconciliation

# Bottom-up forecasts the individual stocks and then rolls them up to sector and total levels.

hier_fc_bu <- hier_fit |>
  reconcile(
    bottom_up = bottom_up(arima)
  ) |>
  forecast(h = 12)

hier_fc_bu
# A fable: 168 x 6 [1M]
# Key:     sector, stock, .model [14]
   sector stock     .model    month
   <chr*> <chr*>    <chr>     <mth>
 1 Energy CVX_value arima  2024 Jan
 2 Energy CVX_value arima  2024 Feb
 3 Energy CVX_value arima  2024 Mar
 4 Energy CVX_value arima  2024 Apr
 5 Energy CVX_value arima  2024 May
 6 Energy CVX_value arima  2024 Jun
 7 Energy CVX_value arima  2024 Jul
 8 Energy CVX_value arima  2024 Aug
 9 Energy CVX_value arima  2024 Sep
10 Energy CVX_value arima  2024 Oct
# ℹ 158 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

Top-down reconciliation

# Top-down starts with the total portfolio forecast and pushes it down into the lower levels.

hier_fc_td <- hier_fit |>
  reconcile(
    top_down = top_down(arima)
  ) |>
  forecast(h = 12)

hier_fc_td
# A fable: 168 x 6 [1M]
# Key:     sector, stock, .model [14]
   sector stock     .model    month
   <chr*> <chr*>    <chr>     <mth>
 1 Energy CVX_value arima  2024 Jan
 2 Energy CVX_value arima  2024 Feb
 3 Energy CVX_value arima  2024 Mar
 4 Energy CVX_value arima  2024 Apr
 5 Energy CVX_value arima  2024 May
 6 Energy CVX_value arima  2024 Jun
 7 Energy CVX_value arima  2024 Jul
 8 Energy CVX_value arima  2024 Aug
 9 Energy CVX_value arima  2024 Sep
10 Energy CVX_value arima  2024 Oct
# ℹ 158 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

MinT reconciliation

# MinT uses forecast error relationships across the hierarchy to adjust the forecasts.

hier_fc_mint <- hier_fit |>
  reconcile(
    mint = min_trace(arima)
  ) |>
  forecast(h = 12)

hier_fc_mint
# A fable: 168 x 6 [1M]
# Key:     sector, stock, .model [14]
   sector stock     .model    month
   <chr*> <chr*>    <chr>     <mth>
 1 Energy CVX_value arima  2024 Jan
 2 Energy CVX_value arima  2024 Feb
 3 Energy CVX_value arima  2024 Mar
 4 Energy CVX_value arima  2024 Apr
 5 Energy CVX_value arima  2024 May
 6 Energy CVX_value arima  2024 Jun
 7 Energy CVX_value arima  2024 Jul
 8 Energy CVX_value arima  2024 Aug
 9 Energy CVX_value arima  2024 Sep
10 Energy CVX_value arima  2024 Oct
# ℹ 158 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

Middle-out reconciliation

# Middle-out uses the sector level as the middle point.
# It reconciles upward to the total and downward to the individual stocks.

hier_fc_mo <- hier_fit |>
  reconcile(
    middle_out = middle_out(arima, split = 1)
  ) |>
  forecast(h = 12)

hier_fc_mo
# A fable: 168 x 6 [1M]
# Key:     sector, stock, .model [14]
   sector stock     .model    month
   <chr*> <chr*>    <chr>     <mth>
 1 Energy CVX_value arima  2024 Jan
 2 Energy CVX_value arima  2024 Feb
 3 Energy CVX_value arima  2024 Mar
 4 Energy CVX_value arima  2024 Apr
 5 Energy CVX_value arima  2024 May
 6 Energy CVX_value arima  2024 Jun
 7 Energy CVX_value arima  2024 Jul
 8 Energy CVX_value arima  2024 Aug
 9 Energy CVX_value arima  2024 Sep
10 Energy CVX_value arima  2024 Oct
# ℹ 158 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

Combine the reconciled forecasts

# Each forecast object also keeps the original ARIMA model, so I combine them first and then filter to only the reconciliation methods.

all_forecasts <- as_tibble(hier_fc_bu) |>
  bind_rows(as_tibble(hier_fc_td)) |>
  bind_rows(as_tibble(hier_fc_mint)) |>
  bind_rows(as_tibble(hier_fc_mo))

all_forecasts
# A tibble: 672 × 6
   sector stock     .model    month
   <chr*> <chr*>    <chr>     <mth>
 1 Energy CVX_value arima  2024 Jan
 2 Energy CVX_value arima  2024 Feb
 3 Energy CVX_value arima  2024 Mar
 4 Energy CVX_value arima  2024 Apr
 5 Energy CVX_value arima  2024 May
 6 Energy CVX_value arima  2024 Jun
 7 Energy CVX_value arima  2024 Jul
 8 Energy CVX_value arima  2024 Aug
 9 Energy CVX_value arima  2024 Sep
10 Energy CVX_value arima  2024 Oct
# ℹ 662 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>
all_forecasts_clean <- all_forecasts |>
  filter(.model %in% c("bottom_up", "top_down", "mint", "middle_out"))

all_forecasts_clean
# A tibble: 336 × 6
   sector stock     .model       month
   <chr*> <chr*>    <chr>        <mth>
 1 Energy CVX_value bottom_up 2024 Jan
 2 Energy CVX_value bottom_up 2024 Feb
 3 Energy CVX_value bottom_up 2024 Mar
 4 Energy CVX_value bottom_up 2024 Apr
 5 Energy CVX_value bottom_up 2024 May
 6 Energy CVX_value bottom_up 2024 Jun
 7 Energy CVX_value bottom_up 2024 Jul
 8 Energy CVX_value bottom_up 2024 Aug
 9 Energy CVX_value bottom_up 2024 Sep
10 Energy CVX_value bottom_up 2024 Oct
# ℹ 326 more rows
# ℹ 2 more variables: value <dist>, .mean <dbl>

Accuracy comparison

#This compares each reconciled forecast against the 2024 test data.

accuracy_results <- all_forecasts_clean |>
  as_tsibble(
    key = c(sector, stock, .model),
    index = month
  ) |>
  as_fable(
    response = "value",
    distribution = value
  ) |>
  accuracy(hier_test)

accuracy_results
# A tibble: 28 × 12
   .model  sector     stock      .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE
   <chr>   <chr*>     <chr*>     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 bottom… Energy     CVX_value  Test  104.  128.  105.  5.74   5.83   NaN   NaN
 2 bottom… Energy     XOM_value  Test  166.  213.  198.  7.05   8.52   NaN   NaN
 3 bottom… Energy     <aggregat… Test  270.  326.  300.  6.50   7.28   NaN   NaN
 4 bottom… Technology AAPL_value Test  228.  428.  366.  6.58  12.6    NaN   NaN
 5 bottom… Technology MSFT_value Test   81.1 127.  100.  3.12   3.88   NaN   NaN
 6 bottom… Technology <aggregat… Test  309.  455.  376.  5.33   6.76   NaN   NaN
 7 bottom… <aggregat… <aggregat… Test  579.  651.  579.  6.02   6.02   NaN   NaN
 8 middle… Energy     CVX_value  Test   19.8  87.5  78.4 0.951  4.43   NaN   NaN
 9 middle… Energy     XOM_value  Test   54.1 183.  143.  2.24   6.24   NaN   NaN
10 middle… Energy     <aggregat… Test   73.9 255.  188.  1.70   4.63   NaN   NaN
# ℹ 18 more rows
# ℹ 1 more variable: ACF1 <dbl>

Summary by reconciliation method

# I averaged the accuracy measures across the hierarchy.
# The lowest average RMSE is the best-performing method.

accuracy_summary <- accuracy_results |>
  group_by(.model) |>
  summarise(
    Avg_RMSE = mean(RMSE, na.rm = TRUE),
    Avg_MAE = mean(MAE, na.rm = TRUE),
    Avg_MAPE = mean(MAPE, na.rm = TRUE)
  ) |>
  arrange(Avg_RMSE)

accuracy_summary
# A tibble: 4 × 4
  .model     Avg_RMSE Avg_MAE Avg_MAPE
  <chr>         <dbl>   <dbl>    <dbl>
1 middle_out     216.    180.     5.28
2 top_down       219.    183.     5.29
3 mint           256.    219.     5.83
4 bottom_up      332.    289.     7.26