Data 624 HW2

#Libraries
# install.packages("forecast")
library(forecast)
library(fpp2)
library(ggplot2)

QUESTION 3.1

For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance.

usnetelec, usgdp, mcopper, enplanements

USNETELEC TIME SERIES

# Estimate the Box-Cox parameter for the usnetelec series.
lambda <- BoxCox.lambda(usnetelec)
lambda  # 0.5167714

## [1] 0.5167714

# Untransformed series, for comparison with the transformed plot below.
autoplot(usnetelec) + ggtitle("USNETELEC Time Series")

# Build the title from the estimated lambda instead of a hand-typed copy so
# the label cannot drift from the value actually passed to BoxCox().
autoplot(BoxCox(usnetelec, lambda)) +
  ggtitle(paste0(
    "USNETELEC with Box-Cox Transformation Lambda = ",
    format(lambda, digits = 7)
  ))

USGDP TIME SERIES

# Estimate the Box-Cox parameter for the usgdp series.
lambda <- BoxCox.lambda(usgdp)
lambda  # 0.366352

## [1] 0.366352

# Untransformed series, for comparison with the transformed plot below.
autoplot(usgdp) + ggtitle("USGDP Time Series")

# The title previously carried the usnetelec lambda (0.5167714) by copy-paste.
# Build it from the lambda estimated for usgdp so it matches the transformation
# that is actually plotted.
autoplot(BoxCox(usgdp, lambda)) +
  ggtitle(paste0(
    "USGDP with Box-Cox Transformation Lambda = ",
    format(lambda, digits = 7)
  ))

MCOPPER TIME SERIES

# Estimate the Box-Cox parameter for the mcopper series.
lambda <- BoxCox.lambda(mcopper)
lambda  # 0.1919047

## [1] 0.1919047

# Untransformed series, for comparison with the transformed plot below.
autoplot(mcopper) + ggtitle("MCOPPER Time Series")

# The title previously carried the usnetelec lambda (0.5167714) by copy-paste.
# Build it from the lambda estimated for mcopper so it matches the
# transformation that is actually plotted.
autoplot(BoxCox(mcopper, lambda)) +
  ggtitle(paste0(
    "MCOPPER with Box-Cox Transformation Lambda = ",
    format(lambda, digits = 7)
  ))

ENPLANEMENTS TIME SERIES

# Estimate the Box-Cox parameter for the enplanements series.
lambda <- BoxCox.lambda(enplanements)
lambda  # -0.2269461

## [1] -0.2269461

# Untransformed series, for comparison with the transformed plot below.
autoplot(enplanements) + ggtitle("ENPLANEMENTS Time Series")

# The title previously carried the usnetelec lambda (0.5167714) by copy-paste.
# Build it from the lambda estimated for enplanements so it matches the
# transformation that is actually plotted.
autoplot(BoxCox(enplanements, lambda)) +
  ggtitle(paste0(
    "ENPLANEMENTS with Box-Cox Transformation Lambda = ",
    format(lambda, digits = 7)
  ))

QUESTION 3.2

Why is a Box-Cox transformation unhelpful for the cangas data?

The Box-Cox transformation does not seem to have any effect on the variance. The variance of the cangas data still changes over time after the transformation, so the transformation is unhelpful.

## [1] 0.5767759

QUESTION 3.3

What Box-Cox transformation would you select for your retail data (from Exercise 3 in Section 2.10)?

The Box-Cox transformation I would select for the retail data would be a Lambda of 0.1276369

library(readxl)

## Warning: package 'readxl' was built under R version 3.4.4

# Load the retail workbook from the working directory; skip = 1 tells
# read_excel() to skip one row before it starts reading.
retaildata <- readxl::read_excel(path = "retail.xlsx", skip = 1)

## readxl works best with a newer version of the tibble package.
## You currently have tibble v1.3.4.
## Falling back to column name repair from tibble <= v1.4.2.
## Message displays once per session.

#head(retaildata,5)

# Choose one series (column A3349873A) from the retail data and treat it as
# a monthly time series starting in April 1982.
myts <- ts(retaildata[, "A3349873A"], frequency = 12, start = c(1982, 4))
autoplot(myts) + ggtitle("Myts Time Series")

# Estimate the Box-Cox parameter for the chosen retail series.
lambda <- BoxCox.lambda(myts)
lambda

## [1] 0.1276369

# Build the title from the estimated lambda instead of a hand-typed copy so
# the label stays correct if a different series is selected above.
autoplot(BoxCox(myts, lambda)) +
  ggtitle(paste0(
    "Myts with Box-Cox Transformation Lambda = ",
    format(lambda, digits = 7)
  ))

QUESTION 3.8

For your retail time series (from Exercise 3 in Section 2.10):

Split the data into two parts using the code below.

Read in monthly Australian retail data.

library(readxl)

# Read the retail workbook again (one row skipped) and preview its first
# five rows.
retaildata <- readxl::read_excel(path = "retail.xlsx", skip = 1)
head(retaildata, n = 5)

## # A tibble: 5 x 190
##   `Series ID` A3349335T A3349627V A3349338X A3349398A A3349468W A3349336V
##        <dttm>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1  1982-04-01     303.1      41.7      63.9     408.7      65.8      91.8
## 2  1982-05-01     297.8      43.1      64.0     404.9      65.8     102.6
## 3  1982-06-01     298.0      40.3      62.7     401.0      62.3     105.0
## 4  1982-07-01     307.9      40.9      65.6     414.4      68.2     106.0
## 5  1982-08-01     299.2      42.1      62.6     403.8      66.0      96.9
## # ... with 183 more variables: A3349337W <dbl>, A3349397X <dbl>,
## #   A3349399C <dbl>, A3349874C <dbl>, A3349871W <dbl>, A3349790V <dbl>,
## #   A3349556W <dbl>, A3349791W <dbl>, A3349401C <dbl>, A3349873A <dbl>,
## #   A3349872X <dbl>, A3349709X <dbl>, A3349792X <dbl>, A3349789K <dbl>,
## #   A3349555V <dbl>, A3349565X <dbl>, A3349414R <dbl>, A3349799R <dbl>,
## #   A3349642T <dbl>, A3349413L <dbl>, A3349564W <dbl>, A3349416V <dbl>,
## #   A3349643V <dbl>, A3349483V <dbl>, A3349722T <dbl>, A3349727C <dbl>,
## #   A3349641R <dbl>, A3349639C <dbl>, A3349415T <dbl>, A3349349F <dbl>,
## #   A3349563V <dbl>, A3349350R <dbl>, A3349640L <dbl>, A3349566A <dbl>,
## #   A3349417W <dbl>, A3349352V <dbl>, A3349882C <dbl>, A3349561R <dbl>,
## #   A3349883F <dbl>, A3349721R <dbl>, A3349478A <dbl>, A3349637X <dbl>,
## #   A3349479C <dbl>, A3349797K <dbl>, A3349477X <dbl>, A3349719C <dbl>,
## #   A3349884J <dbl>, A3349562T <dbl>, A3349348C <dbl>, A3349480L <dbl>,
## #   A3349476W <dbl>, A3349881A <dbl>, A3349410F <dbl>, A3349481R <dbl>,
## #   A3349718A <dbl>, A3349411J <dbl>, A3349638A <dbl>, A3349654A <dbl>,
## #   A3349499L <dbl>, A3349902A <dbl>, A3349432V <dbl>, A3349656F <dbl>,
## #   A3349361W <dbl>, A3349501L <dbl>, A3349503T <dbl>, A3349360V <dbl>,
## #   A3349903C <dbl>, A3349905J <dbl>, A3349658K <dbl>, A3349575C <dbl>,
## #   A3349428C <dbl>, A3349500K <dbl>, A3349577J <dbl>, A3349433W <dbl>,
## #   A3349576F <dbl>, A3349574A <dbl>, A3349816F <dbl>, A3349815C <dbl>,
## #   A3349744F <dbl>, A3349823C <dbl>, A3349508C <dbl>, A3349742A <dbl>,
## #   A3349661X <dbl>, A3349660W <dbl>, A3349909T <dbl>, A3349824F <dbl>,
## #   A3349507A <dbl>, A3349580W <dbl>, A3349825J <dbl>, A3349434X <dbl>,
## #   A3349822A <dbl>, A3349821X <dbl>, A3349581X <dbl>, A3349908R <dbl>,
## #   A3349743C <dbl>, A3349910A <dbl>, A3349435A <dbl>, A3349365F <dbl>,
## #   A3349746K <dbl>, A3349370X <dbl>, ...

# Select column A3349873A as a monthly time series starting in April 1982.
myts <- ts(retaildata[, "A3349873A"], frequency = 12, start = c(1982, 4))

#myts <- readxl::read_excel("retail.xlsx", skip=1)
# Training set: observations up to December 2010.
# Test set: observations from January 2011 onward.
myts.train <- window(myts, end = c(2010, 12))
myts.test <- window(myts, start = c(2011, 1))

Check that your data have been split appropriately by producing the following plot

# Overlay the training and test windows on the full series to confirm
# visually that the split lands where intended.
autoplot(myts) +
  autolayer(myts.train, series = "Training") +
  autolayer(myts.test, series = "Test")

Calculate forecasts using snaive applied to myts.train.

# Seasonal naive forecasts fitted on the training window only.
# NOTE(review): no h is passed, so the forecast horizon is snaive()'s default;
# confirm that it covers all of myts.test before comparing accuracy.
fc <- snaive(myts.train)

Compare the accuracy of your forecasts against the actual values stored in myts.test.

# Report training-set and test-set accuracy measures for the forecasts.
accuracy(fc, myts.test)

##                     ME     RMSE      MAE       MPE      MAPE     MASE
## Training set  7.772973 20.24576 15.95676  4.702754  8.109777 1.000000
## Test set     55.300000 71.44309 55.78333 14.900996 15.082019 3.495907
##                   ACF1 Theil's U
## Training set 0.7385090        NA
## Test set     0.5315239  1.297866

Check the residuals. Do the residuals appear to be uncorrelated and normally distributed? Based on the ACF plot and the residual histogram below, the residuals are clearly autocorrelated and do not appear to be normally distributed. The Ljung-Box test (p-value < 2.2e-16) confirms the autocorrelation.

# Residual diagnostics for the seasonal naive fit; the Ljung-Box test result
# is printed below.
checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 624.45, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

How sensitive are the accuracy measures to the training/test split?

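Comparing the accuracy measures below (training data to the end of 2009) with those from the first split above (training data to the end of 2010) shows that the accuracy measures are very sensitive to the choice of train/test split: for example, the test-set RMSE drops from 71.44 to 34.32 and the MASE from 3.50 to 1.74.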

#myts <- readxl::read_excel("retail.xlsx", skip=1)
# Repeat the exercise with the split moved one year earlier: train up to
# December 2009, test from January 2010 onward. This overwrites the earlier
# myts.train, myts.test and fc objects.
myts.train <- window(myts, end = c(2009, 12))
myts.test <- window(myts, start = c(2010, 1))
fc <- snaive(myts.train)
accuracy(fc, myts.test)

##                     ME     RMSE      MAE       MPE     MAPE     MASE
## Training set  8.989097 19.91337 15.62773  5.225943 8.065561 1.000000
## Test set     -5.695833 34.32162 27.13750 -3.330530 8.948578 1.736497
##                   ACF1 Theil's U
## Training set 0.6955129        NA
## Test set     0.6105826 0.9200031