Import Packages & Data

# import required packages
library(readr)
library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(hts)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(tidyr)
library(tsibble)

## 
## Attaching package: 'tsibble'

## The following object is masked from 'package:lubridate':
## 
##     interval

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(fable)

## Loading required package: fabletools

library(fabletools)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ purrr   1.0.2     ✔ tibble  3.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tsibbledata)
library(fpp3)

## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ feasts 0.3.2     
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

# import csv files
apple_df <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_5/Apple.csv")

## Rows: 1258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Date, Close/Last, Open, High, Low
## dbl (1): Volume
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

GM_df <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_5/GM.csv")

## Rows: 1258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Date, Close/Last, Open, High, Low
## dbl (1): Volume
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

microsoft_df <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_5/microsoft.csv")

## Rows: 1258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Date, Close/Last, Open, High, Low
## dbl (1): Volume
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ford_df <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_5/ford.csv")

## Rows: 1258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Date, Close/Last, Open, High, Low
## dbl (1): Volume
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Manipulation

# joining all closing prices into a single dataframe
stocks_df <- data.frame(apple_df$Date, apple_df$`Close/Last`, microsoft_df$`Close/Last`, ford_df$`Close/Last`, GM_df$`Close/Last`)

# renaming columns of closing prices to their respective names
names(stocks_df)[1] <- "date"
names(stocks_df)[2] <- "apple"
names(stocks_df)[3] <- "microsoft"
names(stocks_df)[4] <- "ford"
names(stocks_df)[5] <- "GM"

# format date and converting it to ascending order
stocks_df$date <- as.Date(stocks_df$date, format = "%m/%d/%Y")
stocks_df <- stocks_df[order(as.Date(stocks_df$date, format="%m/%d/%Y")),]
head(stocks_df)

##            date    apple microsoft  ford     GM
## 1258 2019-04-10  $50.155   $120.19 $9.33 $39.25
## 1257 2019-04-11 $49.7375   $120.33 $9.39 $39.33
## 1256 2019-04-12 $49.7175   $120.95 $9.45 $39.71
## 1255 2019-04-15 $49.8075   $121.05 $9.33 $39.57
## 1254 2019-04-16 $49.8125   $120.77 $9.36 $39.66
## 1253 2019-04-17 $50.7825   $121.77 $9.50 $39.99

# converting prices to numeric columns and removing dollar signs 

columns <-c("apple", "microsoft", "ford", "GM")
stocks_df[, columns] <- lapply(columns, function(x) as.numeric(gsub("\\$", "", stocks_df[[x]])))

# Extract year and month from the Date column
stocks_df$Year <- format(stocks_df$date, "%Y")
stocks_df$Month <- format(stocks_df$date, "%m")

# calculating average closing prices by month
monthly_stocks_df <- stocks_df %>%
  group_by(Month, Year) %>%
  summarise(apple = mean(apple), microsoft = mean(microsoft), ford = mean(ford), GM = mean(GM))

# Convert Year and Month back to factors
monthly_stocks_df$Year <- factor(monthly_stocks_df$Year)
monthly_stocks_df$Month <- factor(monthly_stocks_df$Month)

# Combine Year and Month columns into a single Date column
monthly_stocks_df$date <- as.Date(paste(monthly_stocks_df$Year, monthly_stocks_df$Month, "01", sep = "-"))

# Convert Date variable to Date format
monthly_stocks_df$date <- as.Date(monthly_stocks_df$date, format = "%Y/%m/%d")

monthly_stocks_df <- monthly_stocks_df[order(as.Date(monthly_stocks_df$date, format="%Y/%m/%d")),]

# removing all columns except closing prices to calculate rate and value

monthly_stocks_df <- monthly_stocks_df %>%
  ungroup() %>%
  select(-Month, -Year, -date)


head(monthly_stocks_df)

## # A tibble: 6 × 4
##   apple microsoft  ford    GM
##   <dbl>     <dbl> <dbl> <dbl>
## 1  50.7      124.  9.65  39.6
## 2  47.8      126. 10.1   36.8
## 3  48.2      132.  9.94  36.4
## 4  51.3      138. 10.1   39.4
## 5  51.2      136.  9.13  37.9
## 6  54.5      138.  9.26  38.1

# full set of the time series data
monthly_stocks_df=monthly_stocks_df[1:60,]
# accumulation rate
monthly_stocks_df[2:60,]=1+(monthly_stocks_df[2:60,]-monthly_stocks_df[1:59,])/monthly_stocks_df[1:59,] 

# value per month
for (i in 2:60) monthly_stocks_df[i,]=monthly_stocks_df[i-1, ]*monthly_stocks_df[i,]

# adding date back into the dataframe
monthly_stocks_df$date=yearmonth(seq(as.Date("2019-04-01"), as.Date("2024/03/31"), by="months"))

# train test split
train=monthly_stocks_df[1:48,]
test=monthly_stocks_df[49:60,]

# Time Series
full_ts=monthly_stocks_df%>%as_tsibble(index=date)
train_ts=train%>%as_tsibble(index=date)
test_ts=test%>%as_tsibble(index=date)

# convert to time series
full=full_ts%>%pivot_longer(!c(date), names_to='Stock', values_to='Value')
train=train_ts%>%pivot_longer(!c(date), names_to='Stock', values_to='Value')
test=test_ts%>%pivot_longer(!c(date), names_to='Stock', values_to='Value')

# assigning sectors
full$Sector=rep('Tech', nrow(full))
train$Sector=rep('Tech', nrow(train))
test$Sector=rep('Tech', nrow(test))

full$Sector[full$Stock=="GM"|full$Stock=="ford"]="Auto"
train$Sector[train$Stock=="GM"|train$Stock=="ford"]="Auto"
test$Sector[test$Stock=="GM"|test$Stock=="ford"]="Auto"

full %>% autoplot() + ggtitle("Monthly Closing Stock Prices") + xlab("Date (Year Month)") + ylab("Dollars")

## Plot variable not specified, automatically selected `.vars = Value`

# aggregate time series for plotting
myagg <- full |>
  aggregate_key(Stock/Sector,Value=sum(Value))
myagg

## # A tsibble: 540 x 4 [1M]
## # Key:       Stock, Sector [9]
##        date Stock        Sector       Value
##       <mth> <chr*>       <chr*>       <dbl>
##  1 2019 Apr <aggregated> <aggregated>  224.
##  2 2019 May <aggregated> <aggregated>  221.
##  3 2019 Jun <aggregated> <aggregated>  227.
##  4 2019 Jul <aggregated> <aggregated>  239.
##  5 2019 Aug <aggregated> <aggregated>  235.
##  6 2019 Sep <aggregated> <aggregated>  240.
##  7 2019 Oct <aggregated> <aggregated>  243.
##  8 2019 Nov <aggregated> <aggregated>  260.
##  9 2019 Dec <aggregated> <aggregated>  269.
## 10 2020 Jan <aggregated> <aggregated>  286.
## # ℹ 530 more rows

# plot aggregated time series
myagg%>%filter(is_aggregated(Sector))%>%
  autoplot(Value) +
  labs(y = "Value",
       title = "Value of Investment") +
  facet_wrap(vars(Stock), scales = "free_y", ncol = 3) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust = 1))

Bottom-up Method

myagg2 <- train |>aggregate_key(Stock,Value=mean(Value))
m1=myagg2|> model(ets = ETS(Value))|>reconcile(bu = bottom_up(ets))
aug1=m1%>%augment()
f1=m1%>%forecast(h=12)
f1%>%autoplot(full)

acc1=f1%>%accuracy(test |>aggregate_key(Stock,Value=mean(Value)))
print(acc1)

## # A tibble: 10 × 11
##    .model Stock        .type       ME   RMSE     MAE      MPE   MAPE  MASE RMSSE
##    <chr>  <chr*>       <chr>    <dbl>  <dbl>   <dbl>    <dbl>  <dbl> <dbl> <dbl>
##  1 bu     GM           Test    -1.28    3.86   3.16    -4.82    9.50   NaN   NaN
##  2 bu     apple        Test    10.1    14.0   12.1      5.45    6.58   NaN   NaN
##  3 bu     ford         Test     0.170   1.06   0.772    0.676   6.22   NaN   NaN
##  4 bu     microsoft    Test    58.0    63.3   58.0     15.9    15.9    NaN   NaN
##  5 bu     <aggregated> Test  -368.    368.   368.    -255.    255.     NaN   NaN
##  6 ets    GM           Test    -1.28    3.86   3.16    -4.82    9.50   NaN   NaN
##  7 ets    apple        Test    10.1    14.0   12.1      5.45    6.58   NaN   NaN
##  8 ets    ford         Test     0.170   1.06   0.772    0.676   6.22   NaN   NaN
##  9 ets    microsoft    Test    58.0    63.3   58.0     15.9    15.9    NaN   NaN
## 10 ets    <aggregated> Test    15.7    16.9   15.7     10.6    10.6    NaN   NaN
## # ℹ 1 more variable: ACF1 <dbl>

Top-down Method

myagg3 <- train |>aggregate_key(Sector/Stock,Value=mean(Value))
m2=myagg3|> model(ets2 = ETS(Value))|>reconcile(td = top_down(ets2))
f2=m2%>%forecast(h=12)
f2%>%autoplot(full%>%aggregate_key(Sector/Stock,Value=mean(Value)))+facet_wrap(~Stock+Sector) +
   theme(axis.text.x = element_text(angle = 90, hjust = 1))

acc2=f2%>%accuracy(test|>aggregate_key(Sector/Stock,Value=mean(Value)))
print(acc2)

## # A tibble: 14 × 12
##    .model Sector      Stock      .type      ME   RMSE     MAE    MPE  MAPE  MASE
##    <chr>  <chr*>      <chr*>     <chr>   <dbl>  <dbl>   <dbl>  <dbl> <dbl> <dbl>
##  1 ets2   Auto        GM         Test   -1.28    3.86   3.16  -4.82   9.50   NaN
##  2 ets2   Auto        ford       Test    0.170   1.06   0.772  0.676  6.22   NaN
##  3 ets2   Auto        <aggregat… Test   -0.557   2.28   1.90  -3.29   8.38   NaN
##  4 ets2   Tech        apple      Test   10.1    14.0   12.1    5.45   6.58   NaN
##  5 ets2   Tech        microsoft  Test   58.0    63.3   58.0   15.9   15.9    NaN
##  6 ets2   Tech        <aggregat… Test   32.0    34.0   32.0   11.8   11.8    NaN
##  7 ets2   <aggregate… <aggregat… Test   15.7    16.9   15.7   10.6   10.6    NaN
##  8 td     Auto        GM         Test   25.9    26.2   25.9   73.8   73.8    NaN
##  9 td     Auto        ford       Test    9.22    9.28   9.22  75.2   75.2    NaN
## 10 td     Auto        <aggregat… Test   11.5    11.7   11.5   48.3   48.3    NaN
## 11 td     Tech        apple      Test  138.    138.   138.    76.1   76.1    NaN
## 12 td     Tech        microsoft  Test  277.    279.   277.    78.8   78.8    NaN
## 13 td     Tech        <aggregat… Test  149.    150.   149.    55.9   55.9    NaN
## 14 td     <aggregate… <aggregat… Test   15.7    16.9   15.7   10.6   10.6    NaN
## # ℹ 2 more variables: RMSSE <dbl>, ACF1 <dbl>

Middle-out Method

#Middle Out Forecast
myagg4 <-train |>aggregate_key(Sector/Stock,Value=mean(Value))
m3=myagg4|>  model(ets3 = ETS(Value)) |>  reconcile(md = middle_out(ets3))
f3=m3%>%forecast(h=12)
f3%>%autoplot(full%>%aggregate_key(Sector/Stock,Value=mean(Value)))+facet_wrap(~Stock+Sector) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

acc3=f3%>%accuracy(test|>aggregate_key(Sector/Stock,Value=mean(Value)))
print(acc3)

## # A tibble: 14 × 12
##    .model Sector       Stock        .type       ME   RMSE     MAE     MPE  MAPE
##    <chr>  <chr*>       <chr*>       <chr>    <dbl>  <dbl>   <dbl>   <dbl> <dbl>
##  1 ets3   Auto         GM           Test    -1.28    3.86   3.16   -4.82   9.50
##  2 ets3   Auto         ford         Test     0.170   1.06   0.772   0.676  6.22
##  3 ets3   Auto         <aggregated> Test    -0.557   2.28   1.90   -3.29   8.38
##  4 ets3   Tech         apple        Test    10.1    14.0   12.1     5.45   6.58
##  5 ets3   Tech         microsoft    Test    58.0    63.3   58.0    15.9   15.9 
##  6 ets3   Tech         <aggregated> Test    32.0    34.0   32.0    11.8   11.8 
##  7 ets3   <aggregated> <aggregated> Test    15.7    16.9   15.7    10.6   10.6 
##  8 md     Auto         GM           Test    16.9    17.2   16.9    47.6   47.6 
##  9 md     Auto         ford         Test     6.20    6.29   6.20   50.3   50.3 
## 10 md     Auto         <aggregated> Test    -0.557   2.28   1.90   -3.29   8.38
## 11 md     Tech         apple        Test    94.8    95.2   94.8    52.3   52.3 
## 12 md     Tech         microsoft    Test   203.    206.   203.     57.6   57.6 
## 13 md     Tech         <aggregated> Test    32.0    34.0   32.0    11.8   11.8 
## 14 md     <aggregated> <aggregated> Test  -113.    114.   113.    -78.7   78.7 
## # ℹ 3 more variables: MASE <dbl>, RMSSE <dbl>, ACF1 <dbl>

Model Statistics

glance(m1)

## # A tibble: 10 × 10
##    Stock        .model   sigma2 log_lik   AIC  AICc   BIC    MSE   AMSE    MAE
##    <chr*>       <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>
##  1 GM           ets    14.6       -156.  319.  319.  324.  14.0   36.7  3.03  
##  2 GM           bu     14.6       -156.  319.  319.  324.  14.0   36.7  3.03  
##  3 apple        ets     0.00534   -190.  391.  392.  400.  69.2  145.   0.0546
##  4 apple        bu      0.00534   -190.  391.  392.  400.  69.2  145.   0.0546
##  5 ford         ets     0.0139    -104.  214.  214.  219.   2.16   5.06 0.0880
##  6 ford         bu      0.0139    -104.  214.  214.  219.   2.16   5.06 0.0880
##  7 microsoft    ets     0.00305   -210.  431.  432.  440. 169.   357.   0.0413
##  8 microsoft    bu      0.00305   -210.  431.  432.  440. 169.   357.   0.0413
##  9 <aggregated> ets     0.00314   -171.  352.  354.  362.  30.9   68.5  0.0414
## 10 <aggregated> bu      0.00314   -171.  352.  354.  362.  30.9   68.5  0.0414

glance(m2)

## # A tibble: 14 × 11
##    Sector      Stock      .model  sigma2 log_lik   AIC  AICc   BIC    MSE   AMSE
##    <chr*>      <chr*>     <chr>    <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 Auto        GM         ets2   1.46e+1   -156.  319.  319.  324.  14.0   36.7 
##  2 Auto        GM         td     1.46e+1   -156.  319.  319.  324.  14.0   36.7 
##  3 Auto        ford       ets2   1.39e-2   -104.  214.  214.  219.   2.16   5.06
##  4 Auto        ford       td     1.39e-2   -104.  214.  214.  219.   2.16   5.06
##  5 Auto        <aggregat… ets2   9.86e-3   -136.  279.  279.  284.   6.14  15.8 
##  6 Auto        <aggregat… td     9.86e-3   -136.  279.  279.  284.   6.14  15.8 
##  7 Tech        apple      ets2   5.34e-3   -190.  391.  392.  400.  69.2  145.  
##  8 Tech        apple      td     5.34e-3   -190.  391.  392.  400.  69.2  145.  
##  9 Tech        microsoft  ets2   3.05e-3   -210.  431.  432.  440. 169.   357.  
## 10 Tech        microsoft  td     3.05e-3   -210.  431.  432.  440. 169.   357.  
## 11 Tech        <aggregat… ets2   3.16e-3   -197.  405.  406.  414.  96.6  203.  
## 12 Tech        <aggregat… td     3.16e-3   -197.  405.  406.  414.  96.6  203.  
## 13 <aggregate… <aggregat… ets2   3.14e-3   -171.  352.  354.  362.  30.9   68.5 
## 14 <aggregate… <aggregat… td     3.14e-3   -171.  352.  354.  362.  30.9   68.5 
## # ℹ 1 more variable: MAE <dbl>

glance(m3)

## # A tibble: 14 × 11
##    Sector      Stock      .model  sigma2 log_lik   AIC  AICc   BIC    MSE   AMSE
##    <chr*>      <chr*>     <chr>    <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 Auto        GM         ets3   1.46e+1   -156.  319.  319.  324.  14.0   36.7 
##  2 Auto        GM         md     1.46e+1   -156.  319.  319.  324.  14.0   36.7 
##  3 Auto        ford       ets3   1.39e-2   -104.  214.  214.  219.   2.16   5.06
##  4 Auto        ford       md     1.39e-2   -104.  214.  214.  219.   2.16   5.06
##  5 Auto        <aggregat… ets3   9.86e-3   -136.  279.  279.  284.   6.14  15.8 
##  6 Auto        <aggregat… md     9.86e-3   -136.  279.  279.  284.   6.14  15.8 
##  7 Tech        apple      ets3   5.34e-3   -190.  391.  392.  400.  69.2  145.  
##  8 Tech        apple      md     5.34e-3   -190.  391.  392.  400.  69.2  145.  
##  9 Tech        microsoft  ets3   3.05e-3   -210.  431.  432.  440. 169.   357.  
## 10 Tech        microsoft  md     3.05e-3   -210.  431.  432.  440. 169.   357.  
## 11 Tech        <aggregat… ets3   3.16e-3   -197.  405.  406.  414.  96.6  203.  
## 12 Tech        <aggregat… md     3.16e-3   -197.  405.  406.  414.  96.6  203.  
## 13 <aggregate… <aggregat… ets3   3.14e-3   -171.  352.  354.  362.  30.9   68.5 
## 14 <aggregate… <aggregat… md     3.14e-3   -171.  352.  354.  362.  30.9   68.5 
## # ℹ 1 more variable: MAE <dbl>

Analysis

For the training set, the middle-out and top-down approach produced identical model statistics where they outperformed the bottom-up model in terms of AIC, MSE and MAE. When it comes to training accuracy, the bottom-up aggregate model performed the worst while the middle-out aggregate model had the lowest RMSE. Although the middle-out and top-down model have similar training results, the middle-out model will be the best model because of its better performance on the test set.

Predictive Analytics - Week 5 (Hierarchical Time Series)

Lay Khoon Yu

2024-04-10