# load packages
library(readr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyr)
library(tsibble)
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fable)
## Loading required package: fabletools
library(fabletools)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Data: Canada unemployment rate (1991-2022), yearly observations.
External regressor: inflation rate.
Both series obtained from World Bank Open Data.
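For reproducibility, the same two series can also be pulled straight from the World Bank API, for example with the WDI package. This is only a sketch of an alternative to the CSV import used below; FP.CPI.TOTL.ZG (CPI inflation, annual %) and SL.UEM.TOTL.ZS (unemployment, % of labour force) are the standard World Bank indicator codes, but the exported CSV may have been built with different options.
# alternative (not run here): download the series with the WDI package
# install.packages("WDI")
library(WDI)
wdi_df <- WDI(country = "CA",
              indicator = c(Inflation = "FP.CPI.TOTL.ZG",     # CPI inflation, annual %
                            Unemployment = "SL.UEM.TOTL.ZS"), # unemployment, % of labour force
              start = 1991, end = 2022)
head(wdi_df)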
# import csv file
df <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/week_6/CND_INF_UNEMP.csv")
## Rows: 32 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): Inflation, Unemployment
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 3
## Date Inflation Unemployment
## <date> <dbl> <dbl>
## 1 1991-01-01 5.63 10.3
## 2 1992-01-02 1.49 11.2
## 3 1993-01-02 1.87 11.4
## 4 1994-01-03 0.166 10.4
## 5 1995-01-04 2.15 9.49
## 6 1996-01-05 1.57 9.62
# Convert dataset to time series
ts_data <- ts(df, start = 1991, frequency = 1)
# plot time series of inflation and unemployment
autoplot(ts_data[, 2:3], facets = TRUE) +
  xlab("Year") + ylab("") +
  ggtitle("Yearly Inflation and Unemployment Rate in Canada")
# train test split
train_df <- df %>% filter(Date >= "1991-01-01" & Date <= "2015-01-19")
test_df <- df %>% filter(Date >= "2016-01-20" & Date <= "2022-01-24")
# Create the train and test time series (for arima)
train_ts <- ts(train_df, start = 1991, frequency = 1)
test_ts <- ts(test_df, start = 2016, frequency = 1)
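A quick sanity check (added here) that the 25-year training window and 7-year test window cover all 32 observations read in above:
# 25 training years + 7 test years should equal the 32 rows in df
nrow(train_df) + nrow(test_df) == nrow(df)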
# build ARIMA model with inflation as external regressor
fit_arima <- auto.arima(train_ts[,3], xreg = train_ts[,2])
# generate summary of ARIMA model
summary(fit_arima)
## Series: train_ts[, 3]
## Regression with ARIMA(0,1,0) errors
##
## Coefficients:
## xreg
## -0.2460
## s.e. 0.0942
##
## sigma^2 = 0.3641: log likelihood = -21.42
## AIC=46.84 AICc=47.41 BIC=49.2
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1786102 0.5787639 0.4252343 -2.277182 5.304271 0.8497605
## ACF1
## Training set 0.2235584
arima_forecast <- forecast(fit_arima, xreg = test_ts[,2]) # horizon is set by the length of xreg (7 years)
autoplot(arima_forecast)
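The fitted model is a regression on inflation with ARIMA(0,1,0) errors, so it is worth confirming that those errors look like white noise. The residual check below is an addition, using forecast's checkresiduals() on the fitted object above.
# Ljung-Box test and residual plots for the regression-with-ARIMA-errors fit
checkresiduals(fit_arima)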
# build tsibble
train <- train_df |> as_tsibble(index = Date)
# build dynamic nn model
fit <- train |>
  model(NNETAR(Unemployment ~ Inflation))
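NNETAR() chooses the number of lags and hidden nodes automatically (it settles on NNAR(2,2) in the report further down). If you wanted to fix them yourself, fable's AR() special and the n_nodes argument can be set explicitly; this is only an illustrative alternative, not the model used below.
# sketch: fix the lag order and hidden-layer size instead of letting NNETAR() choose
fit_manual <- train |>
  model(NNETAR(Unemployment ~ Inflation + AR(p = 2), n_nodes = 2))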
# build tsibble for test set
test <- test_df |> as_tsibble(index = Date)
# forecast 7 periods ahead with nn model and inflation from test set
nn_forecast <- fit |> forecast(test)
nn_forecast |>
  autoplot(train) +
  labs(x = "Year", y = "Percent", title = "Yearly Unemployment in Canada")
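Because fable's NNETAR forecasts are built by simulating future sample paths, prediction intervals can be pulled from the forecast distribution with fabletools::hilo(). This is an addition to the original output.
# 80% and 95% prediction intervals from the simulated forecast distribution
nn_forecast |> hilo(level = c(80, 95))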
# print report for nn model
report(fit)
## Series: Unemployment
## Model: NNAR(2,2)
##
## Average of 20 networks, each of which is
## a 3-2-1 network with 11 weights
## options were - linear output units
##
## sigma^2 estimated as 0.07348
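As a quick check on the reported architecture (my own arithmetic, not part of the output): NNAR(2,2) with one exogenous regressor has 3 inputs (two lags of unemployment plus inflation), 2 hidden nodes and 1 output, which accounts for the 11 weights reported for the 3-2-1 network.
# weights: input-to-hidden + hidden-to-output + hidden biases + output bias
3*2 + 2*1 + 2 + 1
## [1] 11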
# NN training model
accuracy(fit)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NNETAR(Unemploymen… Trai… -0.00463 0.271 0.223 -0.239 2.98 0.445 0.405 -0.134
print("ARIMA training model")
## [1] "ARIMA training model"
accuracy(fit_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1786102 0.5787639 0.4252343 -2.277182 5.304271 0.8497605
## ACF1
## Training set 0.2235584
print("NN Model Accuracy Metrics")
## [1] "NN Model Accuracy Metrics"
accuracy(test$Unemployment, nn_forecast$.mean)
## ME RMSE MAE MPE MAPE
## Test set 0.9488914 1.879731 1.468617 11.53377 18.14337
print("ARIMA Model Accuracy Metrics")
## [1] "ARIMA Model Accuracy Metrics"
accuracy(test$Unemployment, arima_forecast$mean)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -0.1926845 1.205691 0.9251877 -2.611508 13.73631 0.07328255 2.773599
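To compare the two test-set results side by side, the headline errors can be collected into a single tibble. This is a sketch added here; RMSE and MAE are computed directly from the forecasts and actuals and match the values printed above.
# side-by-side test-set comparison
nn_err <- test$Unemployment - nn_forecast$.mean
arima_err <- test$Unemployment - as.numeric(arima_forecast$mean)
tibble(model = c("NNAR", "ARIMA"),
       RMSE = c(sqrt(mean(nn_err^2)), sqrt(mean(arima_err^2))),
       MAE = c(mean(abs(nn_err)), mean(abs(arima_err))))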
The dynamic neural network model has lower training error measures than the ARIMA model. On the test set, however, the ARIMA model has the lower RMSE, MAE and MAPE (and a smaller ME in absolute terms). The yearly unemployment series does not show particularly complex patterns, so the simpler ARIMA model should be sufficient in this case. I would therefore conclude that the ARIMA model is the better model here.