Homework 2

library(knitr)
library(kableExtra)
library(ggplot2)
library(fpp2)
library(readxl)

Exercise 3.1

For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance.

  • usnetelec

  • usgdp

  • mcopper

  • enplanements

usnetelec

# Show the first observations together with the time-series attributes
# (start, end and frequency).
head(usnetelec)
## Time Series:
## Start = 1949 
## End = 1954 
## Frequency = 1 
## [1] 296.1 334.1 375.3 403.8 447.0 476.3
# Minimum, quartiles, mean and maximum, to gauge the scale of the series.
summary(usnetelec)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   296.1   889.0  2040.9  1972.1  3002.7  3858.5

The output shows a frequency of 1, so the series is annual.

# Plot the raw series to see how its variation changes with its level.
autoplot(usnetelec)

# Let BoxCox.lambda() choose the transformation parameter from the data,
# then print the chosen value.
lambda <- BoxCox.lambda(usnetelec)
lambda
## [1] 0.5167714
# Plot the transformed series to judge whether the variance looks more stable.
autoplot(BoxCox(usnetelec,lambda))

usgdp

# First observations; the Qtr1..Qtr4 layout shows the series is quarterly.
head(usgdp)
##        Qtr1   Qtr2   Qtr3   Qtr4
## 1947 1570.5 1568.7 1568.0 1590.9
## 1948 1616.1 1644.6
# Minimum, quartiles, mean and maximum, to gauge the scale of the series.
summary(usgdp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1568    2632    4552    5168    7130   11404
# Plot the raw series before transforming it.
autoplot(usgdp)

# Choose lambda from the data, print it, then plot the transformed series.
lambda <- BoxCox.lambda(usgdp)
lambda
## [1] 0.366352
autoplot(BoxCox(usgdp,lambda))

mcopper

# First observations; the Jan..Jun layout shows the series is monthly.
head(mcopper)
##        Jan   Feb   Mar   Apr   May   Jun
## 1960 255.2 259.7 249.3 258.0 244.3 246.8
# Minimum, quartiles, mean and maximum, to gauge the scale of the series.
summary(mcopper)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   216.6   566.0   949.2   997.8  1262.5  4306.0
# Plot the raw series before transforming it.
autoplot(mcopper)

# Choose lambda from the data, print it, then plot the transformed series.
lambda <- BoxCox.lambda(mcopper)
lambda
## [1] 0.1919047
autoplot(BoxCox(mcopper,lambda))

enplanements

# First observations; the Jan..Jun layout shows the series is monthly.
head(enplanements)
##        Jan   Feb   Mar   Apr   May   Jun
## 1979 21.12 22.92 25.90 24.38 23.41 26.82
# Minimum, quartiles, mean and maximum, to gauge the scale of the series.
summary(enplanements)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.14   27.18   34.88   35.67   42.78   56.14
# Plot the raw series before transforming it.
autoplot(enplanements)

# Choose lambda from the data and print it; note that it is negative for
# this series. Then plot the transformed series.
lambda <- BoxCox.lambda(enplanements)
lambda
## [1] -0.2269461
autoplot(BoxCox(enplanements,lambda))

Exercise 3.2

Why is a Box-Cox transformation unhelpful for the cangas data?

# First observations; the Jan..Jun layout shows the series is monthly.
head(cangas)
##         Jan    Feb    Mar    Apr    May    Jun
## 1960 1.4306 1.3059 1.4022 1.1699 1.1161 1.0113
# Minimum, quartiles, mean and maximum, to gauge the scale of the series.
summary(cangas)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.966   6.453   8.831   9.777  14.429  19.528
# Plot the raw series; this is the plot to compare against the transformed
# version below.
autoplot(cangas)

# Choose lambda from the data, print it, then plot the transformed series
# to see whether the transformation evened out the variation.
lambda <- BoxCox.lambda(cangas)
lambda
## [1] 0.5767759
autoplot(BoxCox(cangas,lambda))

The plots above show why a Box-Cox transformation is not useful for the cangas data: the middle region of the series has higher variability than the lower and upper regions, so the variability does not increase or decrease uniformly with the level of the series. In other words, the variance is not stable, and it does not change in one direction. A Box-Cox transformation aims to make the seasonal variation roughly constant along the series, but it can only do so when the variation changes steadily with the level. Here, stabilising the variance would require increasing the variability in the lower region, decreasing it in the middle region, and increasing (stretching) it again in the upper region, which a single Box-Cox transformation cannot do.

Exercise 3.3

What Box-Cox transformation would you select for your retail data?

# Read the retail workbook from the working directory; skip = 1 skips one
# row of the sheet before reading.
retaildata <- read_excel("retail.xlsx", skip = 1)

# Build a monthly series (frequency = 12) from column A3349398A, starting
# in April 1982.
myts <- ts(retaildata[, "A3349398A"], frequency = 12, start = c(1982, 4))
head(myts)
##        Apr   May   Jun   Jul   Aug   Sep
## 1982 408.7 404.9 401.0 414.4 403.8 411.8
# Plot the raw series before transforming it.
autoplot(myts)

# Choose lambda from the data, print it, then plot the transformed series.
lambda <- BoxCox.lambda(myts)
lambda
## [1] 0.1231563
autoplot(BoxCox(myts,lambda))

A Box-Cox transformation with a low lambda (= 0.1231563) is a simple and suitable choice for forecasting this series; it also straightened the trend line of the data.

Exercise 3.8

For your retail time series (from Exercise 3 in Section 2.10):

a. Split the data into two parts using

# Training set: every observation up to and including December 2010.
myts.train <- window(myts, end = c(2010, 12))
# Test set: everything from January 2011 onwards.
myts.test <- window(myts, start = c(2011, 1))

b. Check that your data have been split appropriately by producing the following plot

# Draw the full series, then overlay the two subsets in their own colours
# so the training/test boundary can be checked by eye.
autoplot(myts) +
  autolayer(myts.train, series = "Training") +
  autolayer(myts.test, series = "Test")

c. Calculate forecasts using snaive applied to myts.train

# Seasonal naive forecasts fitted on the training data only. No horizon is
# passed here; the accuracy figures for this object are identical to those of
# snaive(myts.train,h=24) further down, so the default evidently spans 24 months.
fc <- snaive(myts.train)

d. Compare the accuracy of your forecasts against the actual values stored in myts.test.

# Score the seasonal naive forecasts against the held-out observations.
# The "Training set" row is the in-sample fit; the "Test set" row is the
# error against myts.test.
accuracy(fc,myts.test)
##                     ME      RMSE       MAE      MPE     MAPE     MASE      ACF1
## Training set  73.94114  88.31208  75.13514 6.068915 6.134838 1.000000 0.6312891
## Test set     115.00000 127.92727 115.00000 4.459712 4.459712 1.530576 0.2653013
##              Theil's U
## Training set        NA
## Test set     0.7267171
# Two simple benchmarks to set against the seasonal naive forecasts in fc.
# NOTE(review): neither call passes h. Their test-set accuracy differs from
# the h = 24 versions fitted further down, so these forecasts do not cover
# the same horizon as fc -- keep that in mind when comparing test-set rows.
fc2 <- meanf(myts.train)
# drift = TRUE makes this the drift method, not the plain naive method.
fc3 <- rwf(myts.train, drift = TRUE)

# Overlay the three point forecasts (no prediction intervals) on the full
# series. fc3 is labelled "Drift" because it was fitted with drift = TRUE;
# labelling it "Naïve" would misreport the method in the legend.
autoplot(myts) +
  autolayer(fc2, series = "Mean", PI = FALSE) +
  autolayer(fc3, series = "Drift", PI = FALSE) +
  autolayer(fc, series = "Seasonal naïve", PI = FALSE) +
  guides(colour = guide_legend(title = "Forecast"))

# Accuracy of each forecast against the test set, one table per method.
# Seasonal naive (fc):
accuracy(fc,myts.test)
##                     ME      RMSE       MAE      MPE     MAPE     MASE      ACF1
## Training set  73.94114  88.31208  75.13514 6.068915 6.134838 1.000000 0.6312891
## Test set     115.00000 127.92727 115.00000 4.459712 4.459712 1.530576 0.2653013
##              Theil's U
## Training set        NA
## Test set     0.7267171
# Mean method (fc2):
accuracy(fc2,myts.test)
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 1.103832e-13  638.0607  543.1396 -32.20817 58.11126  7.228836
## Test set     1.213556e+03 1216.2132 1213.5559  48.35666 48.35666 16.151644
##                   ACF1 Theil's U
## Training set  0.971582        NA
## Test set     -0.223985   10.0224
# Random walk with drift (fc3):
accuracy(fc3,myts.test)
##                         ME     RMSE       MAE         MPE      MAPE     MASE
## Training set  4.527233e-14 111.7486  76.00016  -0.4853011  5.767758 1.011513
## Test set     -5.263812e+02 532.1483 526.38116 -21.1167167 21.116717 7.005792
##                    ACF1 Theil's U
## Training set -0.3205577        NA
## Test set     -0.3054077  4.527626

e. Check the residuals

# Residual diagnostics for the seasonal naive forecasts; the call prints the
# Ljung-Box test shown below, which tests the residuals for autocorrelation.
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 671.41, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Do the residuals appear to be uncorrelated and normally distributed?

The residuals appear to be correlated with each other: the Ljung-Box test rejects the hypothesis of no autocorrelation (p-value < 2.2e-16). The residuals are also not normally distributed: they are not centered around 0, and the distribution has a longer left tail.

How sensitive are the accuracy measures to the training/test split?

# Refit the three benchmarks on the training data with a common 24-month
# horizon so that their test-set accuracy is computed over the same months.
fit1 <- meanf(myts.train, h = 24)
fit2 <- rwf(myts.train, h = 24)
fit3 <- snaive(myts.train, h = 24)

# Overlay the point forecasts (no prediction intervals) on the full series.
autoplot(myts) +
  autolayer(fit1, series = "Mean", PI = FALSE) +
  autolayer(fit2, series = "Naïve", PI = FALSE) +
  autolayer(fit3, series = "Seasonal naïve", PI = FALSE) +
  labs(
    x = "Year",
    y = "Turnover",
    title = "Forecasts for Retail Turnover on Clothing"
  ) +
  guides(colour = guide_legend(title = "Forecast"))

# Accuracy of the h = 24 forecasts against the test set, one table per method.
# Mean method (fit1):
accuracy(fit1, myts.test)
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 1.103832e-13  638.0607  543.1396 -32.20817 58.11126  7.228836
## Test set     1.302878e+03 1313.6394 1302.8776  49.99667 49.99667 17.340457
##                   ACF1 Theil's U
## Training set 0.9715820        NA
## Test set     0.2066774  7.319144
# Random walk without drift (fit2):
accuracy(fit2, myts.test)
##                       ME     RMSE       MAE         MPE     MAPE     MASE
## Training set    7.509302 112.0006  76.29477   0.2792524  5.77901 1.015434
## Test set     -395.758333 429.8639 410.97500 -15.6801364 16.17306 5.469811
##                    ACF1 Theil's U
## Training set -0.3205577        NA
## Test set      0.2066774  2.432747
# Seasonal naive (fit3); identical to the table for fc above.
accuracy(fit3, myts.test)
##                     ME      RMSE       MAE      MPE     MAPE     MASE      ACF1
## Training set  73.94114  88.31208  75.13514 6.068915 6.134838 1.000000 0.6312891
## Test set     115.00000 127.92727 115.00000 4.459712 4.459712 1.530576 0.2653013
##              Theil's U
## Training set        NA
## Test set     0.7267171

The accuracy measures are always sensitive to the training/test split, given that there is significant growth in the trend and variability of the data. Based on the accuracy tables, the mean-based error measurements are more affected than the percentage-based measurements. Among these three methods for this dataset, the seasonal naive method is considered the best: it has the lowest test-set RMSE, MAE, MAPE and MASE.