library(tidyverse)
## -- Attaching packages ---------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(mosaic)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(ggformula)
library(readr)
library(fpp2)
## Loading required package: forecast
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff
## Loading required package: fma
## Loading required package: expsmooth
## DISCUSSION WEEK 4
## READ IN US GAS DATA
usgas <- read_csv("E:/WOODS/ADECXXXX/usgas.csv")
## Parsed with column specification:
## cols(
##   Date = col_character(),
##   GasBarrelsK = col_double()
## )
gas <- usgas$GasBarrelsK
gasts <- ts(gas, start = c(1945, 1), frequency = 12)
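# A hedged aside, not in the original analysis: the ets() fit further down
# warns about missing values, so it is worth checking for NAs up front.
# na.interp() from the forecast package could interpolate the gaps if a
# complete series were preferred.
sum(is.na(gasts))            # count missing observations
# gasts <- na.interp(gasts)  # optional: seasonal interpolation of the gaps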
autoplot(gasts)
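# Optional seasonal displays (not in the original write-up) to support the
# seasonality assumption made before the ARIMA fit below; both functions come
# from the forecast package loaded via fpp2.
ggseasonplot(gasts, year.labels = FALSE)  # one line per year
ggsubseriesplot(gasts)                    # one mini-series per month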

## Create training and test sets: hold out 2018 onward (about 18 months) for testing
train <- window(gasts, start = 1945, end = c(2017, 12))
test  <- window(gasts, start = c(2018, 1))


# Check the differenced series for remaining autocorrelation (Ljung-Box test)
Box.test(diff(gasts), lag=5, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  diff(gasts)
## X-squared = 131.88, df = 5, p-value < 2.2e-16
# The Ljung-Box p-value is significant here and at other lag choices, so there is evidence of autocorrelation in the differenced series
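# As a supplementary cross-check (not run in the original analysis), the
# forecast package can estimate the required differencing orders directly
# from unit-root tests.
ndiffs(gasts)   # suggested order of first differencing
nsdiffs(gasts)  # suggested order of seasonal differencing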

cbind("ThousandBarrels" = gasts, 
      "Logs" = log(gasts), 
      "Seasonally\n diff logs" = diff(log(gasts),12), 
      "Doubly\n diff logs" = diff(diff(log(gasts),12),1)) %>% autoplot(facets=TRUE)

# The doubly differenced data appears the most stationary
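# To back the visual judgment with a formal test, a KPSS test on the doubly
# differenced logs should fail to reject stationarity. This sketch assumes the
# urca package is installed; it was not part of the original analysis.
library(urca)
diff(diff(log(gasts), 12)) %>% ur.kpss() %>% summary()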

# I expect gas consumption to display seasonality, so I use a seasonal ARIMA model
auto.arima(gasts)
## Series: gasts 
## ARIMA(5,1,1)(1,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5     ma1     sar1
##       -1.0849  -0.7296  -0.2435  -0.1853  -0.1571  0.2887  -0.4737
## s.e.   0.1313   0.1123   0.0829   0.0497   0.0336  0.1301   0.2198
##          sma1     sma2
##       -0.2475  -0.5308
## s.e.   0.2037   0.1616
## 
## sigma^2 estimated as 16335:  log likelihood=-5539.32
## AIC=11098.63   AICc=11098.89   BIC=11146.47
fitgas <- Arima(gasts, order = c(5,1,1), seasonal = c(1,1,2))
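# auto.arima() uses a stepwise search by default and can miss better models; a
# fuller search is a reasonable cross-check, though much slower. This was not
# run in the original analysis.
auto.arima(gasts, stepwise = FALSE, approximation = FALSE)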

# A forecast for the next 36 months shows an imperfect fit, with more variation in the actual seasonal spikes than the forecast reproduces
fitgas %>% forecast(h=36) %>% autoplot()
## Warning: Removed 1 rows containing missing values (geom_path).

# The residual ACF and histogram look reasonable; the Ljung-Box test below, however, is strongly significant, so some autocorrelation remains in the residuals
checkresiduals(fitgas)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,1)(1,1,2)[12]
## Q* = 68.01, df = 15, p-value = 1.008e-08
## 
## Model df: 9.   Total lags used: 24
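# Since the Ljung-Box test rejects, it helps to see where the leftover
# correlation sits; ggAcf() from the forecast package plots the residual ACF.
# A supplementary check, not in the original write-up.
ggAcf(residuals(fitgas))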
# Compare ETS and ARIMA
etsgas <- ets(gasts)
## Warning in ets(gasts): Missing values encountered. Using longest contiguous
## portion of time series
etsgas
## ETS(A,Ad,A) 
## 
## Call:
##  ets(y = gasts) 
## 
##   Smoothing parameters:
##     alpha = 0.242 
##     beta  = 0.0145 
##     gamma = 0.072 
##     phi   = 0.9751 
## 
##   Initial states:
##     l = 1451.6809 
##     b = 33.0415 
##     s = -82.3314 -112.8097 9.5379 -6.0319 306.5798 279.9161
##            312.3923 129.9725 22.5041 -112.235 -287.0032 -460.4915
## 
##   sigma:  140.4363
## 
##      AIC     AICc      BIC 
## 14970.79 14971.57 15057.15
# When I forecast with ETS over the same 36-month horizon, I do not see a marked difference from the ARIMA forecast
etsA <- etsgas %>% forecast(h=36) %>% autoplot()
etsA
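# AICc values from ARIMA and ETS fits are not directly comparable (the ARIMA
# likelihood is computed on differenced data), which motivates the out-of-
# sample comparison below. Time series cross-validation is an alternative
# check; this tsCV() sketch was not run in the original analysis and refits
# both models at every forecast origin, so it is slow.
fets   <- function(x, h) forecast(ets(x), h = h)
farima <- function(x, h) forecast(Arima(x, order = c(5, 1, 1),
                                        seasonal = c(1, 1, 2)), h = h)
e_ets   <- tsCV(gasts, fets, h = 1)
e_arima <- tsCV(gasts, farima, h = 1)
sqrt(mean(e_ets^2,   na.rm = TRUE))   # one-step RMSE, ETS
sqrt(mean(e_arima^2, na.rm = TRUE))   # one-step RMSE, ARIMA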

## Re-create the train and test sets (2018 onward held out, about 18 months) under the names used in the model comparison
gastrain <- window(gasts, start = 1945, end=c(2017,12))
gastest <- window(gasts, start =c(2018,1))

(trainarima <- auto.arima(gastrain))
## Series: gastrain 
## ARIMA(2,1,0)(1,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2     sar1     sma1     sma2
##       -0.7966  -0.4860  -0.4702  -0.2485  -0.5310
## s.e.   0.0305   0.0309   0.1917   0.1775   0.1405
## 
## sigma^2 estimated as 16681:  log likelihood=-5424.95
## AIC=10861.9   AICc=10862   BIC=10890.46
checkresiduals(trainarima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,0)(1,1,2)[12]
## Q* = 111.66, df = 19, p-value = 3.997e-15
## 
## Model df: 5.   Total lags used: 24
(trainets <- ets(gastrain))
## ETS(A,Ad,A) 
## 
## Call:
##  ets(y = gastrain) 
## 
##   Smoothing parameters:
##     alpha = 0.2473 
##     beta  = 0.0141 
##     gamma = 0.0826 
##     phi   = 0.9779 
## 
##   Initial states:
##     l = 1467.0222 
##     b = 28.2339 
##     s = -67.8564 -105.3485 -43.0852 18.124 307.6296 254.2347
##            291.5958 143.9167 32.128 -104.0561 -301.2189 -426.0635
## 
##   sigma:  140.2058
## 
##      AIC     AICc      BIC 
## 14614.38 14615.18 14700.34
checkresiduals(trainets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,Ad,A)
## Q* = 533.74, df = 7, p-value < 2.2e-16
## 
## Model df: 17.   Total lags used: 24
# Both training fits fail the residual Ljung-Box test, so the held-out
# accuracy comparison below is the more informative basis for choosing a model
a1 <- trainarima %>% forecast(h=12*(2019-2017)+1) %>% accuracy(gasts)
a1[,c("RMSE", "MAE", "MAPE", "MASE")]
##                  RMSE       MAE     MAPE      MASE
## Training set 127.8193  99.83868 1.760497 0.5212059
## Test set     120.5700 104.86746 1.126133 0.5474586
a2 <- trainets %>% forecast(h=12*(2019-2017)+1) %>% accuracy(gasts)
a2[,c("RMSE", "MAE", "MAPE", "MASE")]
##                  RMSE      MAE     MAPE     MASE
## Training set 138.8387 107.6625 2.014391 0.562050
## Test set     134.7797 113.3080 1.224021 0.591522
# The ARIMA model performs better than ETS (lower RMSE, MAE, MAPE, and MASE) on both the training and the test set.
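# A visual version of the same comparison (an addition, not original output):
# overlay both test-period forecasts on the actuals with autolayer() from the
# forecast package, zooming in on recent years.
h_test <- length(gastest)
autoplot(window(gasts, start = c(2015, 1))) +
  autolayer(forecast(trainarima, h = h_test), series = "ARIMA", PI = FALSE) +
  autolayer(forecast(trainets, h = h_test),   series = "ETS",   PI = FALSE)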