library(tidyverse)
## -- Attaching packages ---------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mosaic)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(ggformula)
library(readr)
library(fpp2)
## Loading required package: forecast
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
## Loading required package: fma
## Loading required package: expsmooth
##DISCUSSION WEEK 4
## READ IN US GAS DATA
usgas <- read_csv("E:/WOODS/ADECXXXX/usgas.csv")
## Parsed with column specification:
## cols(
## Date = col_character(),
## GasBarrelsK = col_double()
## )
gas <- usgas$GasBarrelsK
gasts <- ts(gas, start = c(1945, 1), frequency = 12)
autoplot(gasts)

## Create Train and Test Data
## Create Test and Training Sets - split out 2018 onward, about 18 months
test <- window(gasts, start = 1945, end=c(2017,12))
train <- window(gasts, start =c(2018,1))
# Test for differencing
Box.test(diff(gasts), lag=5, type="Ljung-Box")
##
## Box-Ljung test
##
## data: diff(gasts)
## X-squared = 131.88, df = 5, p-value < 2.2e-16
#Given the L-B p-value is significant at several lag values, there is evidence of autocorrelation
cbind("ThousandBarrels" = gasts,
"Logs" = log(gasts),
"Seasonally\n diff logs" = diff(log(gasts),12),
"Doubly\n diff logs" = diff(diff(log(gasts),12),1)) %>% autoplot(facets=TRUE)

# The doubly differenced data appears the most stationary
# I expect gas consumption to display seasonality, so use the seasonal ARIMA
auto.arima(gasts)
## Series: gasts
## ARIMA(5,1,1)(1,1,2)[12]
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 sar1
## -1.0849 -0.7296 -0.2435 -0.1853 -0.1571 0.2887 -0.4737
## s.e. 0.1313 0.1123 0.0829 0.0497 0.0336 0.1301 0.2198
## sma1 sma2
## -0.2475 -0.5308
## s.e. 0.2037 0.1616
##
## sigma^2 estimated as 16335: log likelihood=-5539.32
## AIC=11098.63 AICc=11098.89 BIC=11146.47
fitgas <- Arima(gasts, order = c(5,1,1), seasonal = c(1,1,2))
# A forecast for the last 36 months shows an imperfect fit,with more variation in the seasonal spikes
fitgas %>% forecast(h=36) %>% autoplot()
## Warning: Removed 1 rows containing missing values (geom_path).

# THe residuals plots do not raise any concerning flags, however
checkresiduals(fitgas)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,1,1)(1,1,2)[12]
## Q* = 68.01, df = 15, p-value = 1.008e-08
##
## Model df: 9. Total lags used: 24
#COmpare ETS and ARIMA
etsgas <- ets(gasts)
## Warning in ets(gasts): Missing values encountered. Using longest contiguous
## portion of time series
etsgas
## ETS(A,Ad,A)
##
## Call:
## ets(y = gasts)
##
## Smoothing parameters:
## alpha = 0.242
## beta = 0.0145
## gamma = 0.072
## phi = 0.9751
##
## Initial states:
## l = 1451.6809
## b = 33.0415
## s = -82.3314 -112.8097 9.5379 -6.0319 306.5798 279.9161
## 312.3923 129.9725 22.5041 -112.235 -287.0032 -460.4915
##
## sigma: 140.4363
##
## AIC AICc BIC
## 14970.79 14971.57 15057.15
# When I forcast using the ETS for the same 36 month horizon, I do not see a marked difference from ARIMA
etsA <- gasts %>% ets() %>% forecast(h=36) %>% autoplot()
## Warning in ets(.): Missing values encountered. Using longest contiguous
## portion of time series
etsA

## Create Train and Test Data
## Create Test and Training Sets - split out 2018 onward, about 18 months
gastrain <- window(gasts, start = 1945, end=c(2017,12))
gastest <- window(gasts, start =c(2018,1))
(trainarima <- auto.arima(gastrain))
## Series: gastrain
## ARIMA(2,1,0)(1,1,2)[12]
##
## Coefficients:
## ar1 ar2 sar1 sma1 sma2
## -0.7966 -0.4860 -0.4702 -0.2485 -0.5310
## s.e. 0.0305 0.0309 0.1917 0.1775 0.1405
##
## sigma^2 estimated as 16681: log likelihood=-5424.95
## AIC=10861.9 AICc=10862 BIC=10890.46
checkresiduals(trainarima)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,0)(1,1,2)[12]
## Q* = 111.66, df = 19, p-value = 3.997e-15
##
## Model df: 5. Total lags used: 24
(trainets <- ets(gastrain))
## ETS(A,Ad,A)
##
## Call:
## ets(y = gastrain)
##
## Smoothing parameters:
## alpha = 0.2473
## beta = 0.0141
## gamma = 0.0826
## phi = 0.9779
##
## Initial states:
## l = 1467.0222
## b = 28.2339
## s = -67.8564 -105.3485 -43.0852 18.124 307.6296 254.2347
## 291.5958 143.9167 32.128 -104.0561 -301.2189 -426.0635
##
## sigma: 140.2058
##
## AIC AICc BIC
## 14614.38 14615.18 14700.34
checkresiduals(trainets)

##
## Ljung-Box test
##
## data: Residuals from ETS(A,Ad,A)
## Q* = 533.74, df = 7, p-value < 2.2e-16
##
## Model df: 17. Total lags used: 24
a1 <- trainarima %>% forecast(h=12*(2019-2017)+1) %>% accuracy(gasts)
a1[,c("RMSE", "MAE", "MAPE", "MASE")]
## RMSE MAE MAPE MASE
## Training set 127.8193 99.83868 1.760497 0.5212059
## Test set 120.5700 104.86746 1.126133 0.5474586
a2 <- trainets %>% forecast(h=12*(2019-2017)+1) %>% accuracy(gasts)
a2[,c("RMSE", "MAE", "MAPE", "MASE")]
## RMSE MAE MAPE MASE
## Training set 138.8387 107.6625 2.014391 0.562050
## Test set 134.7797 113.3080 1.224021 0.591522
# THe ARIMA model performs better on both the train and the test set.