# load packages
library(readr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyr)
library(tsibble)
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fable)
## Loading required package: fabletools
library(fabletools)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Data: Canada unemployment rate (1991-2022), yearly observations.
External regressor: inflation rate.
Both series obtained from World Bank Open Data.
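For reproducibility, the same two series can also be pulled straight from the World Bank API, for example with the WDI package. This is only a sketch of an alternative to the CSV import used below; FP.CPI.TOTL.ZG (CPI inflation, annual %) and SL.UEM.TOTL.ZS (unemployment, % of labour force) are the standard World Bank indicator codes, but the exported CSV may have been built with different options.
# alternative (not run here): download the series with the WDI package
# install.packages("WDI")
library(WDI)
wdi_df <- WDI(country = "CA",
              indicator = c(Inflation = "FP.CPI.TOTL.ZG",     # CPI inflation, annual %
                            Unemployment = "SL.UEM.TOTL.ZS"), # unemployment, % of labour force
              start = 1991, end = 2022)
head(wdi_df)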
# import csv file
df <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_6/CND_INF_UNEMP.csv")
## Rows: 32 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Inflation, Unemployment
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 3
## Date Inflation Unemployment
## <date> <dbl> <dbl>
## 1 1991-01-01 5.63 10.3
## 2 1992-01-02 1.49 11.2
## 3 1993-01-02 1.87 11.4
## 4 1994-01-03 0.166 10.4
## 5 1995-01-04 2.15 9.49
## 6 1996-01-05 1.57 9.62
# Convert dataset to time series
ts_data <- ts(df, start = 1991, frequency = 1)
# plot time series of inflation and unemployment
autoplot(ts_data[, 2:3], facets = TRUE) +
  xlab("Year") + ylab("") +
  ggtitle("Yearly Inflation and Unemployment Rate in Canada")
# train test split
train_df <- df %>% filter(Date >= "1991-01-01" & Date <= "2015-01-19")
test_df <- df %>% filter(Date >= "2016-01-20" & Date <= "2022-01-24")
# Create the train and test time series (for arima)
train_ts <- ts(train_df, start = 1991, frequency = 1)
test_ts <- ts(test_df, start = 2016, frequency = 1)
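A quick sanity check (added here) that the 25-year training window and 7-year test window cover all 32 observations read in above:
# 25 training years + 7 test years should equal the 32 rows in df
nrow(train_df) + nrow(test_df) == nrow(df)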
# build ARIMA model with inflation as external regressor
fit_arima <- auto.arima(train_ts[,3], xreg = train_ts[,2])
# generate summary of ARIMA model
summary(fit_arima)
## Series: train_ts[, 3]
## Regression with ARIMA(0,1,0) errors
##
## Coefficients:
## xreg
## -0.2460
## s.e. 0.0942
##
## sigma^2 = 0.3641: log likelihood = -21.42
## AIC=46.84 AICc=47.41 BIC=49.2
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1786102 0.5787639 0.4252343 -2.277182 5.304271 0.8497605
## ACF1
## Training set 0.2235584
arima_forecast <- forecast(fit_arima, xreg = test_ts[,2]) # horizon is set by the length of xreg (7 years)
autoplot(arima_forecast)
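The fitted model is a regression on inflation with ARIMA(0,1,0) errors, so it is worth confirming that those errors look like white noise. The residual check below is an addition, using forecast's checkresiduals() on the fitted object above.
# Ljung-Box test and residual plots for the regression-with-ARIMA-errors fit
checkresiduals(fit_arima)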
# build tsibble
train <- train_df |> as_tsibble(index = Date)
# build dynamic nn model
fit <- train |>
  model(NNETAR(Unemployment ~ Inflation))
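NNETAR() chooses the number of lags and hidden nodes automatically (it settles on NNAR(2,2) in the report further down). If you wanted to fix them yourself, fable's AR() special and the n_nodes argument can be set explicitly; this is only an illustrative alternative, not the model used below.
# sketch: fix the lag order and hidden-layer size instead of letting NNETAR() choose
fit_manual <- train |>
  model(NNETAR(Unemployment ~ Inflation + AR(p = 2), n_nodes = 2))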
# build tsibble for test set
test <- test_df |> as_tsibble(index = Date)
# forecast 7 periods ahead with nn model and inflation from test set
nn_forecast <- fit |> forecast(test)
nn_forecast |>
  autoplot(train) +
  labs(x = "Year", y = "Percent", title = "Yearly Unemployment in Canada")
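Because fable's NNETAR forecasts are built by simulating future sample paths, prediction intervals can be pulled from the forecast distribution with fabletools::hilo(). This is an addition to the original output.
# 80% and 95% prediction intervals from the simulated forecast distribution
nn_forecast |> hilo(level = c(80, 95))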
# print report for nn model
report(fit)
## Series: Unemployment
## Model: NNAR(2,2)
##
## Average of 20 networks, each of which is
## a 3-2-1 network with 11 weights
## options were - linear output units
##
## sigma^2 estimated as 0.07348
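As a quick check on the reported architecture (my own arithmetic, not part of the output): NNAR(2,2) with one exogenous regressor has 3 inputs (two lags of unemployment plus inflation), 2 hidden nodes and 1 output, which accounts for the 11 weights reported for the 3-2-1 network.
# weights: input-to-hidden + hidden-to-output + hidden biases + output bias
3*2 + 2*1 + 2 + 1
## [1] 11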
# NN training model
accuracy(fit)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NNETAR(Unemploymen… Trai… -0.00463 0.271 0.223 -0.239 2.98 0.445 0.405 -0.134
print("ARIMA training model")
## [1] "ARIMA training model"
accuracy(fit_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1786102 0.5787639 0.4252343 -2.277182 5.304271 0.8497605
## ACF1
## Training set 0.2235584
print("NN Model Accuracy Metrics")
## [1] "NN Model Accuracy Metrics"
accuracy(test$Unemployment, nn_forecast$.mean)
## ME RMSE MAE MPE MAPE
## Test set 0.9488914 1.879731 1.468617 11.53377 18.14337
print("ARIMA Model Accuracy Metrics")
## [1] "ARIMA Model Accuracy Metrics"
accuracy(test$Unemployment, arima_forecast$mean)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -0.1926845 1.205691 0.9251877 -2.611508 13.73631 0.07328255 2.773599
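To compare the two test-set results side by side, the headline errors can be collected into a single tibble. This is a sketch added here; RMSE and MAE are computed directly from the forecasts and actuals and match the values printed above.
# side-by-side test-set comparison
nn_err <- test$Unemployment - nn_forecast$.mean
arima_err <- test$Unemployment - as.numeric(arima_forecast$mean)
tibble(model = c("NNAR", "ARIMA"),
       RMSE = c(sqrt(mean(nn_err^2)), sqrt(mean(arima_err^2))),
       MAE = c(mean(abs(nn_err)), mean(abs(arima_err))))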
The dynamic neural network model has lower training error measures than the ARIMA model. On the test set, however, the ARIMA model has the lower RMSE, MAE and MAPE (and a smaller ME in absolute terms). The yearly unemployment series does not show particularly complex patterns, so the simpler ARIMA model should be sufficient in this case. I would therefore conclude that the ARIMA model is the better model here.