DATA 624 Homework 2

library(knitr)
library(rmdformats)

## Global options
options(max.print="31")
# opts_chunk$set(echo=FALSE,
#                cache=TRUE,
#                prompt=FALSE,
#                tidy=TRUE,
#                comment=NA,
#                message=FALSE,
#                warning=FALSE)
opts_knit$set(width=31)

library(fpp2)

## Loading required package: ggplot2

## Loading required package: forecast

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Loading required package: fma

## Loading required package: expsmooth

library(ggplot2)
library(data.table)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# install.packages("precrec")
library(precrec)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
# install.packages("ROCit")
library(ROCit)

Question 3.1

For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance.

usnetelec
usgdp
mcopper
enplanements

usnetelec

autoplot(usnetelec) + 
  ylab("billion kwh") +  
  xlab("Year") +
  ggtitle("Annual US Net Electricity Generation (billion kwh) for 1949 - 2003")

lambda.1 <- BoxCox.lambda(usnetelec)
lambda.1

## [1] 0.5167714

autoplot(BoxCox(usnetelec,lambda.1)) + ggtitle("Box Cox Transformation of Annual US Net Electricity Generation")

#str(usnetelec)
# finding min and max for the ts usnetelec
usnetelec[which.min(usnetelec)]

## [1] 296.1

usnetelec[which.max(usnetelec)]

## [1] 3858.5

# finding range for y for the ggplot object ggplot.1
ggplot.1 <- autoplot(BoxCox(usnetelec,lambda.1))
layer_scales(ggplot.1)$y$range$range

## [1]  34.69772 136.12246

Realized that the transformation based on the chosen value of lambda makes little difference to the forecasts but shrinks the prediction interval a great deal, i.e., from [296.1, 3858.5] to [34.69772, 136.12246]

usgdp

autoplot(usgdp) + 
  ylab("US Dollars") +  
  xlab("Year") +
  ggtitle("Quarterly US GDP")

lambda.2 <- BoxCox.lambda(usgdp)
lambda.2

## [1] 0.366352

autoplot(BoxCox(usgdp,lambda.2)) +  ggtitle("Box Cox Transformation of Quarterly US GDP")

# finding min and max for the ts usgdp
usgdp[which.min(usgdp)]

## [1] 1568

usgdp[which.max(usgdp)]

## [1] 11403.6

# finding range for y for the ggplot object ggplot.2
ggplot.2 <- autoplot(BoxCox(usgdp,lambda.2))
layer_scales(ggplot.2)$y$range$range

## [1] 37.70217 80.90904

The transformation based on the chosen value of lambda (lambda.2) makes little difference to the forecasts but shrinks the prediction interval a great deal, i.e., from [1568, 11403.6] to [37.70217, 80.90904]

mcopper

autoplot(mcopper) + 
  ylab("pounds per ton") +  
  xlab("Year") +
  ggtitle("Monthly Copper Price")

lambda.3 <- BoxCox.lambda(mcopper)
lambda.3

## [1] 0.1919047

plot3 <- autoplot(BoxCox(mcopper,lambda.3)) +  ggtitle("Box Cox Transformation of Monthly copper price")
ggplotly(plot3)

# finding min and max for the ts mcopper
mcopper[which.min(mcopper)]

## [1] 216.6

mcopper[which.max(mcopper)]

## [1] 4306

# finding range for y for the ggplot object ggplot.3
ggplot.3 <- autoplot(BoxCox(mcopper,lambda.3))
layer_scales(ggplot.3)$y$range$range

## [1]  9.415507 20.749415

The Monthly Copper Price time plot definitely shows an upward trend. Note that there has been a sharp increase since 2002 until it reaches its peak in 2006. There must be some variable that affecting it in such a dramatic way. As there is defintely a seasonal variation in monthly copper price, a lambda value is used to make the size of the seaonal variation about the same across the whole series. Again, the Box Cox transformation makes the prediction interval smaller, i.e., from [216.6, 4306] to [9.415507, 20.749415].

enplanements

autoplot(enplanements) + 
  ylab("millions") +  
  xlab("Year") +
  ggtitle("US Domestic Enplanements")

lambda.4 <- BoxCox.lambda(enplanements)
lambda.4

## [1] -0.2269461

plot4 <- autoplot(BoxCox(enplanements,lambda.4)) + ggtitle("Box Cox Transformation of US Domestic Enplanements")
ggplotly(plot4)

# finding min and max for the ts enplanements
enplanements[which.min(enplanements)]

## [1] 20.14

enplanements[which.max(enplanements)]

## [1] 56.14

# finding range for y for the ggplot object ggplot.4
ggplot.4 <- autoplot(BoxCox(enplanements,lambda.4))
layer_scales(ggplot.4)$y$range$range

## [1] 2.177250 2.639942

The time plot shows that there is seasonal component and an upward trend. The year of 2001 appears to peak and has a dramatic drop in domestic enplanements. The shrink in prediction intervals by the BoxCox transformations is from [20.14, 56.14] to [2.177250, 2.639942].

Question 3.2

Why is a Box-Cox transformation unhelpful for the cangas data?

autoplot(cangas) + 
  ylab("billion cubic metres") +  
  xlab("Year") +
  ggtitle("Monthly Canadian Gas Production")

As the seasonal variations throughout the time series is not uniform across the whole time series, you can’t really use the Box-Cox transformation to average out the variation throughout the series.

Question 3.3

What Box-Cox transformation would you select for your retail data (from Exercise 3 in Section 2.10)?

retaildata <- readxl::read_excel("retail.xlsx", skip=1)
myts <- ts(retaildata[,"A3349398A"],frequency=12, start=c(1982,4))
autoplot(myts) + ylab("Retail Food Sales") + ggtitle("New South Wales - Food Sales")

lambda.food <- BoxCox.lambda(myts)
lambda.food

## [1] 0.1231563

autoplot(BoxCox(myts,lambda.food)) +  ggtitle("Box Cox Transformation of Retail Food Sales in New South Wales")

The BoxCox.lambda function was used to choose a value for lambda to make the size of the seasonal variation constant. The value of lambda chosen is 0.12. The transformed time plot has more even seasonal variation throughout.

Question 3.8

Split the data into two parts using

myts.train <- window(myts, end=c(2010,12))
myts.test <- window(myts, start=2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myts) +
  autolayer(myts.train, series="Training") +
  autolayer(myts.test, series="Test")

Calculate forecasts using snaive applied to myts.train.

fc <- snaive(myts.train)

Compare the accuracy of your forecasts against the actual values stored in myts.test.

accuracy(fc,myts.test)

##                     ME      RMSE       MAE      MPE     MAPE     MASE      ACF1
## Training set  73.94114  88.31208  75.13514 6.068915 6.134838 1.000000 0.6312891
## Test set     115.00000 127.92727 115.00000 4.459712 4.459712 1.530576 0.2653013
##              Theil's U
## Training set        NA
## Test set     0.7267171

Check the residuals.

checkresiduals(fc)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 671.41, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Do the residuals appear to be uncorrelated and normally distributed?

The residuals plot seem to show there is heteroskedasticity as there is no constant variance. But it is definitely normally distributed. The ACF plot does show there are autocorrelations in the series as there are more than 5% of the spikes lie outside of the dashed blue line. Thus the residuals are correlated.

How sensitive are the accuracy measures to the training/test split?

# original test split was performed on Jan 2011
autoplot(fc) + geom_vline(xintercept = 2011 , linetype="dotted", 
                color = "steelblue", size=1.5)

# Referenced codes provided by Donny Lofland

# Lets create different train/test splits to see how accuracy really varies
x <- c(2002:2012)
accur_train <- list()
accur_test <- list()
i <- 1
for (year in x) {
  myts.train <- window(myts, end=c(year,12))
  myts.test <- window(myts, start=year+1)
#  p <- autoplot(myts) +
#    autolayer(myts.train, series="Training") +
#    autolayer(myts.test, series="Test")
#  print(p)
  fc <- snaive(myts.train)
  forecast_obj <- accuracy(fc, myts.test)
  accur_train[[i]] <- forecast_obj[1,2]
  accur_test[[i]] <- forecast_obj[2,2]
  i <- i + 1
}
# Combine RMSE into a dataframe for convenience
df <- do.call(rbind, Map(data.frame, Year=x, RMSE_Train=accur_train, RMSE_Test=accur_test))
df <- df %>%
  select(Year, RMSE_Train, RMSE_Test) %>%
  gather(key='Group', value='RMSE', -Year)
# Plot RMSE for training and test data based on where we cut
ggplot(df, aes(x=Year, y=RMSE)) +
  geom_line(aes(color=Group, linetype=Group)) +
  scale_x_continuous(breaks = 0:2100) +
  scale_color_manual(values=c('darkred', 'steelblue'))

To answer the question of sensitivity (or accuracy) of the training/test split, I do see that Jan 2012, a year added in addition to the original split in 2011, attains the optimal accuracy with the lowest Root Mean Squared Error (RMSE) for the test set.