library(readxl)
library(dplyr)
library(seasonal)
library(ggplot2)
library(forecast)
library(tidyverse)
library(tsibble)
library(feasts)
library(lubridate)
library(tsibbledata)
library(fable)
library(zoo)
library(fst)
library(fpp3)
library(fabletools)
library(patchwork)
library(ZIM)
vicD <- aus_livestock %>%
dplyr::filter(Animal == "Pigs", State == "Victoria") %>%
select(-Animal, -State) %>%  # Month is already a yearmonth index; no date conversion needed
as_tsibble(index = Month)
fit <- vicD %>%
model(ETS(Count))
report(fit)
Series: Count
Model: ETS(A,N,A)
Smoothing parameters:
alpha = 0.3579401
gamma = 0.0001000139
Initial states:
l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
95487.5 2066.235 6717.311 -2809.562 -1341.299 -7750.173 -8483.418 5610.89
s[-7] s[-8] s[-9] s[-10] s[-11]
-579.8107 1215.464 -2827.091 1739.465 6441.989
sigma^2: 60742898
AIC AICc BIC
13545.38 13546.26 13610.24
# alpha = 0.358
# l0 = 95487.5
fc <- fit %>% forecast(h = 4) %>% hilo()
info <- fit %>% augment()
fit %>%
forecast(h=4) %>%
autoplot(vicD[c(450:nrow(vicD)),c(1,2)])
# 95% CI
s <- sd(info$.resid)
low <- fc$.mean[1] - 1.96*(s)
high <- fc$.mean[1] + 1.96*(s)
low
[1] 69328.25
high
[1] 99521.17
fc$`95%`[1]
<hilo[1]>
[1] [69149.19, 99700.22]95
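#
# The manual interval above plugs the residual SD into mean +/- 1.96*sd; the model's own
# interval can also be pulled out of the hilo column programmatically. A minimal sketch,
# assuming the `fc` object created above and that unpack_hilo() is available in the
# installed fabletools version:
fc %>%
  as_tibble() %>%
  unpack_hilo(`95%`) %>%
  select(Month, .mean, `95%_lower`, `95%_upper`)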
y = ts(vicD$Count)
alpha = 0.358
level = 95487.5
# 8.1
smooth.fun <- function(y, alpha, l0){
y_hat = l0
for(i in 1:length(y)){
y_hat = alpha*y[i] + (1 - alpha)*y_hat
}
return(y_hat)
}
smooth.fun(y, alpha, level)
[1] 95128.59
# Not exactly the same as the ETS() fit (which is ETS(A,N,A), so it also carries a seasonal
# component), but very close.
y = ts(vicD$Count)
alpha = 0.358
l0 = 95487.5
# 8.1
sse.fun <- function(y, alpha, l0){
y_hat = l0
sse = 0
for(i in 1:length(y)){
e = y[i] - y_hat
sse <- sse + e^2
y_hat = alpha*y[i] + (1 - alpha)*y_hat
}
return(sse)
}
sse.fun(y, alpha, level)
[1] 48771790412
#optim(par = c(y, alpha, l0), fn = ETS)
#optim(par = c(y, alpha, l0), sse.fun)
#optim(par = c(vicD$Count, alpha, vicD$Count[1]),
# fn = sse.fun,
# data = vicD)
#optim(par = alpha, l0,y=y, sse.fun)
# The same SSE computation written out inline; note the error is computed before the
# level is updated, matching sse.fun() above.
y_hat = level
sse = 0
for(i in 1:length(y)){
e = y[i] - y_hat
sse <- sse + e^2
y_hat = alpha*y[i] + (1 - alpha)*y_hat
}
sse
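#
# A minimal sketch of the optim() call the commented-out attempts above were aiming for:
# pack alpha and l0 into one parameter vector and pass the series as a fixed argument.
# (Assumes y and sse.fun() as defined above; sse.wrap is a hypothetical helper name, and
# parscale only helps optim cope with the very different magnitudes of the two parameters.)
sse.wrap <- function(par, y) sse.fun(y, alpha = par[1], l0 = par[2])
opt <- optim(par = c(0.5, y[1]), fn = sse.wrap, y = y,
             control = list(parscale = c(1, y[1])))
opt$par  # optimised alpha and l0 -- these should land close to the ETS() estimates above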
ghana <- global_economy %>%
filter(Country == "Ghana") %>%
select(Year, Exports) %>%
as_tsibble(index = Year)
autoplot(ghana)+
labs(title = "Ghana Exports")
ts1 <- ts(ghana$Exports, frequency = 1, start = 1960)  # annual data
f1 <- forecast(
ets(ts1, model = "ANN"),
h = 6
)
plot(f1)
# training and test data
tr.lim <- ghana$Year[round(0.8 * nrow(ghana))]
train <- ghana %>% filter(Year <= tr.lim)
# ANN
myets = ets(train$Exports, model="ANN")
# AAN
myets2 = ets(train$Exports, model="AAN")
# Comparison
summary(myets)
ETS(A,N,N)
Call:
ets(y = train$Exports, model = "ANN")
Smoothing parameters:
alpha = 0.9867
Initial states:
l = 28.1727
sigma: 3.9135
AIC AICc BIC
305.6004 306.1719 311.0864
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.1831829 3.827476 2.776018 -1.974515 16.95663 0.9799308
ACF1
Training set -0.004263608
summary(myets2)
ETS(A,A,N)
Call:
ets(y = train$Exports, model = "AAN")
Smoothing parameters:
alpha = 0.987
beta = 1e-04
Initial states:
l = 28.6786
b = 0.1588
sigma: 4.0025
AIC AICc BIC
309.5303 311.0303 318.6735
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.0122789 3.824559 2.80542 -3.051296 17.17767 0.9903099
ACF1
Training set -0.002742442
#
# Both models perform very similarly on the training set. The ANN model marginally
# outperforms the AAN model: it has lower AIC, AICc and BIC values, and slightly better
# training-set error measures.
#
# Forecasts
myfit <- train %>% # ANN
model(ETS(Exports~ error("A") + trend("N") + season("N")))
myfc <- forecast(myfit, h = 12)
myfc %>% autoplot(train)
myfit2 <- train %>% # AAN
model(ETS(Exports~ error("A") + trend("A") + season("N")))
myfc2 <- forecast(myfit2, h = 12)
myfc2 %>% autoplot(train)
bind_rows(
myfit %>% accuracy(), # ANN
myfit2 %>% accuracy(), # AAN
myfit %>% forecast(h = 10) %>% accuracy(ghana),
myfit2 %>% forecast(h = 10) %>% accuracy(ghana)
) %>%
select(-.model, -ME, -MPE, -ACF1)
# A tibble: 4 x 6
.type RMSE MAE MAPE MASE RMSSE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Training 3.83 2.78 17.0 0.980 0.989
2 Training 3.82 2.81 17.2 0.990 0.988
3 Test 8.43 7.61 28.1 2.69 2.18
4 Test 9.04 8.17 30.2 2.88 2.34
#
#
# The ANN model performs better on both the training and test data. An upward trend in the
# predicted values from the AAN model wouldn't be a bad bet, but then again the trend
# isn't certain.
#
myfc2 %>% autoplot(ghana)
myfc %>% autoplot(ghana)
myfc %>% autoplot(ghana) +
geom_line(aes(y = .fitted), col = "blue",
data = augment(myfit))
# Confidence interval (based on the forecast-package ANN fit `myets` from above)
rmse <- sd(myets$residuals)
f.ann <- forecast(myets, h = 4)
lower <- f.ann$mean[1] - 1.96*rmse
upper <- f.ann$mean[1] + 1.96*rmse
lower
upper
f.ann$lower[1, 2]  # the model's own 95% bound, for comparison
f.ann$upper[1, 2]
china <- global_economy %>%
filter(Country == "China") %>%
select(Year, GDP) %>%
as_tsibble()
autoplot(china) +
labs(title = "Chinese GDP")
p1 <- china %>%
model(
`Holt's method` = ETS(GDP~ error("A") +
trend("A") + season("N")),
`Damped` = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
`Damped Holt's method` = ETS(GDP~ error("A") +
trend("Ad", phi=0.9) + season("N"))
) %>%
forecast(h = 30) %>%
autoplot(china, level = NULL)
p1
china %>%
stretch_tsibble(.init = 10) %>%
model(
`Holt's method` = ETS(GDP~ error("A") +
trend("A") + season("N")),
`Damped` = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
`Damped Holt's method` = ETS(GDP~ error("A") +
trend("Ad", phi=0.9) + season("N"))
) %>%
forecast(h = 1) %>%
accuracy(china)
# A tibble: 3 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Damped Test 2.68e10 2.41e11 1.30e11 3.11 7.58 0.601 0.575 -0.108
2 Damped Holt's me~ Test 5.28e10 2.33e11 1.22e11 3.66 7.80 0.563 0.557 -0.167
3 Holt's method Test 2.23e10 2.36e11 1.28e11 2.97 7.53 0.590 0.563 -0.112
# Damped Holt's method (phi = 0.9) is best when comparing RMSE and MAE values
# Box-Cox
lam = china %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
china.bc <- china %>%
mutate(GDP = box_cox(GDP, lam)) %>%
as_tsibble(index = Year)
# No transformation
china %>%
model(
`Holt's method` = ETS(GDP~ error("A") +
trend("A") + season("N")),
`Damped` = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
`Damped Holt's method` = ETS(GDP~ error("A") +
trend("Ad", phi=0.9) + season("N")),
`Box-Cox2` = ETS(log(GDP)~ drift())
) %>%
forecast(h = 50) %>%
autoplot(china, level = 40)
# Box-cox tranformation
china.bc %>%
model(
`Holt's method` = ETS(GDP~ error("A") +
trend("A") + season("N")),
`Damped` = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
`Damped Holt's method` = ETS(GDP~ error("A") +
trend("Ad", phi=0.9) + season("N")),
`Box-Cox` = ETS(log(GDP)~ drift())
) %>%
forecast(h = 50) %>%
autoplot(china.bc, level = 40)
# No transformation
china %>%
model(
`test` = RW(log(GDP) ~ drift())) %>%
forecast(h = 15) %>%
autoplot(china, level = 40, point_forecast = lst(mean, median))
# Box-cox tranformation
china.bc %>%
model(
`test` = RW(log(GDP) ~ drift())) %>%
forecast(h = 2000) %>%
autoplot(china.bc, level = 40, point_forecast = lst(mean, median))
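#
# A hedged alternative to pre-transforming the series: writing the Box-Cox transformation
# inside the model formula lets fable back-transform (and bias-adjust) the forecasts onto
# the original GDP scale automatically. Assumes `lam` from the guerrero feature above; the
# model label is arbitrary.
china %>%
  model(`Box-Cox damped` = ETS(box_cox(GDP, lam) ~ error("A") + trend("Ad") + season("N"))) %>%
  forecast(h = 20) %>%
  autoplot(china, level = 80)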
ausgas <- aus_production %>%
select(Quarter, Gas) %>%
index_by(Quarter) %>%
as_tsibble(index = Quarter)
train <- ausgas %>% filter_index(. ~ "2007 Q1")
gg_tsdisplay(ausgas)
#
# Multiplicative seasonality is necessary here because the seasonal variation grows
# with the level of the series.
ausgas %>%
filter_index(. ~"2007 Q1") %>%
model(
`Holt's method` = ETS(Gas~ error("M") + trend("A") + season("M")),
`Damped Holt's method` = ETS(Gas~ error("M") +
trend("Ad", phi = 0.9) + season("M"))
) %>%
forecast(h = 13) %>%
autoplot(ausgas, level = NULL)
myfit <- train %>%
model(
ETS(Gas ~ error("M") + trend("A") + season("M"))
)
myfc <- forecast(myfit, h = 13)
myfit2 <- train %>%
model(
ETS(Gas ~ error("M") + trend("Ad", phi = 0.9) + season("M"))
)
myfc2 <- forecast(myfit2, h = 13)
bind_rows(
myfit %>% accuracy(), # Holts method
myfit2 %>% accuracy(), # Damped Holts method
myfit %>% forecast(h = 13) %>% accuracy(ausgas),
myfit2 %>% forecast(h = 13) %>% accuracy(ausgas)
) %>%
select(-.model, -ME, -MPE, -ACF1)
# A tibble: 4 x 6
.type RMSE MAE MAPE MASE RMSSE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Training 4.49 2.91 4.21 0.528 0.592
2 Training 4.46 2.92 4.30 0.530 0.587
3 Test 12.7 10.6 4.67 1.93 1.68
4 Test 9.17 7.76 3.38 1.41 1.21
# Damped method outperforms non-damped method on test data. It is too close to call
# on training data.
tourism %>%
features(Trips, feat_stl) %>%
ggplot(aes(x = trend_strength, y = seasonal_strength_year,
col = Purpose)) +
geom_point() +
facet_wrap(vars(State))
data <- tourism %>%
index_by(Quarter) %>%
summarize(Trips = sum(Trips))%>%
as_tsibble(index = Quarter)
train <- data %>% filter(Quarter <= data$Quarter[round(0.95 * nrow(data))])
autoplot(data) +
labs(title = "Australian domestic overnight trips")
# STL decomposition
mystl <- train %>%
model(
STL(Trips ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE)
) %>%
components()
autoplot(mystl)
dcmp <- train %>%
model(STL(Trips ~ trend(window = 7)))
components(dcmp) %>%
as_tsibble() %>%
autoplot(Trips, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Persons (thousands)",
title = "Total Trips (seasonally adjusted")
# seasonally adjusted data
sadj <- mystl %>%
summarize(Trips = season_adjust) %>%
as_tsibble(index = Quarter)
# additive damped trend method
fit1 <- sadj %>%
model(
ETS(Trips ~ error("A") + trend("Ad") + season("N"))
)
fc <- fit1 %>% forecast(h = "2 years")
fc %>%
autoplot(data)
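#
# A hedged alternative to forecasting the seasonally adjusted series on its own:
# decomposition_model() combines the STL fit with the ETS model on season_adjust and (if I
# recall the fabletools default correctly) re-seasonalises the forecasts with SNAIVE for the
# remaining seasonal component. The model label stl_aadn is arbitrary.
train %>%
  model(stl_aadn = decomposition_model(
    STL(Trips ~ trend(window = 7)),
    ETS(season_adjust ~ error("A") + trend("Ad") + season("N"))
  )) %>%
  forecast(h = "2 years") %>%
  autoplot(data)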
# Holts linear method
fit2 <- train %>%
model(
ETS(Trips ~ error("A") + trend("A") + season("N"))
)
fc2 <- fit2 %>% forecast(h = "2 years")
fc2 %>%
autoplot(data)
# ETS choosing type
fit3 <- train %>%
model(ETS(Trips ~ error("A") + trend("A") + season("A")))
fit3
# A mable: 1 x 1
`ETS(Trips ~ error("A") + trend("A") + season("A"))`
<model>
1 <ETS(A,A,A)>
fc3 <- fit3 %>% forecast(h = "2 years")
fc3 %>% autoplot(data)
# none for both seasonal and trend
bind_rows(
fit1 %>% accuracy(), # AAdN
fit2 %>% accuracy(), # AAN
fit3 %>% accuracy(), # AAA
fit1 %>% forecast(h = "2 years") %>% accuracy(data),
fit2 %>% forecast(h = "2 years") %>% accuracy(data),
fit3 %>% forecast(h = "2 years") %>% accuracy(data)
) %>%
select(-.model, -ME, -MPE, -ACF1)
# A tibble: 6 x 6
.type RMSE MAE MAPE MASE RMSSE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Training 809. 624. 2.96 0.681 0.683
2 Training 1234. 1017. 4.77 1.11 1.04
3 Training 810. 623. 2.96 0.680 0.684
4 Test 886. 659. 2.40 0.720 0.747
5 Test 781. 618. 2.26 0.674 0.659
6 Test 511. 393. 1.45 0.430 0.431
# The AAA model is much better on the test data. On the training data it performs about
# the same as the AAdN model (which was fitted to the seasonally adjusted series).
# fit3 residuals
gg_tsresiduals(fit3)
data <- aus_arrivals %>%
filter(Origin == "NZ") %>%
filter_index(. ~ "2010 Q3") %>%
filter_index("1981 Q1" ~ .) %>%
summarize(Arrivals = Arrivals) %>%
as_tsibble(index = Quarter)
gg_tsdisplay(data)
autoplot(data)+
labs(title = "Arrivals in Australia, from New Zealand")
# training data
tail(data[1],1)
# A tsibble: 1 x 1 [1Q]
Quarter
<qtr>
1 2010 Q3
train <- data %>% filter_index(.~ "2008 Q3")
# Holt-Winters' multiplicative method
myfit <- train %>%
model(
ETS(Arrivals~ error("M") + trend("A") + season("M"))
)
report(myfit)
Series: Arrivals
Model: ETS(M,A,M)
Smoothing parameters:
alpha = 0.6536363
beta = 0.00182446
gamma = 0.2255423
Initial states:
l[0] b[0] s[0] s[-1] s[-2] s[-3]
75563.52 2013.136 0.9978774 1.227715 1.104459 0.6699478
sigma^2: 0.0088
AIC AICc BIC
2630.341 2632.123 2654.726
myfc <- myfit %>% forecast(h = "2 years")
myfc %>% autoplot(data)
#
# Multiplicative seasonality is necessary because the seasonal variation is
# increasing over time. Without it, the model's predictions would eventually be
# unable to keep up with the actual values.
#
# ets
myets <- train %>%
model(
ETS(Arrivals)
)
# seasonal NAIVE
mysnaive <- train %>%
model(
SNAIVE(Arrivals ~ lag("year"))
)
# additive ETS applied to log transformed series
log.ets <- train %>%
mutate(Arrivals = log(Arrivals)) %>%
model(
ETS(Arrivals ~ error("A") + trend("A") + season("A"))
)
# STL decomposition applied to the log-transformed series, then an ETS model on the seasonally adjusted data
stl.dcmp <- data %>%
mutate(Arrivals = log(Arrivals)) %>%
model(
STL(Arrivals ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE)
) %>%
components()
stl.dcmp <- stl.dcmp %>%
summarize(Arrivals = season_adjust) %>%
as_tsibble(index = Quarter)
stl.train <- stl.dcmp %>% filter_index(.~ "2008 Q3")
stl.ets <- stl.train %>%
model(
ETS(Arrivals)
)
# Compiled
fc.myets <- myets %>% forecast(h = "2 years")
fc.mysnaive <- mysnaive %>% forecast(h = "2 years")
fc.log.ets <- log.ets %>% forecast(h = "2 years")
fc.stl.ets <- stl.ets %>% forecast(h = "2 years")
bind_rows(
myets %>% accuracy(), # ETS
mysnaive %>% accuracy(), # SNAIVE
log.ets %>% accuracy(), # Log ETS
stl.ets %>% accuracy(), # Log ETS on Seasonally Decomposed data
myets %>% forecast(h = "2 years") %>% accuracy(data),
mysnaive %>% forecast(h = "2 years") %>% accuracy(data),
log.ets %>% forecast(h = "2 years") %>% accuracy(stl.dcmp),
stl.ets %>% forecast(h = "2 years") %>% accuracy(stl.dcmp)
) %>%
select(-.model, -ME, -MPE, -ACF1)
# A tibble: 8 x 6
.type RMSE MAE MAPE MASE RMSSE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Training 11283. 8939. 6.92 0.578 0.566
2 Training 19923. 15469. 11.2 1 1
3 Training 0.0894 0.0699 0.602 0.595 0.602
4 Training 0.0898 0.0691 0.593 0.588 0.604
5 Test 7693. 6626. 2.31 0.428 0.386
6 Test 10485. 7471. 2.49 0.483 0.526
7 Test 0.166 0.142 1.13 1.21 1.12
8 Test 0.0508 0.0445 0.355 0.379 0.342
#
#
# Between the models that used transformations, the ETS model fitted to the seasonally
# adjusted (STL-decomposed) log data clearly outperforms the model that only log-transformed
# its data. The two are hard to separate on the training data, but the difference on the
# test data is substantial.
#
#
# Comparing the models that did not undergo any transformations, the ETS model outperforms
# the seasonal NAIVE model by a wide margin for both the training and test data.
#
stl.ets
# A mable: 1 x 1
`ETS(Arrivals)`
<model>
1 <ETS(A,N,A)>
myets
# A mable: 1 x 1
`ETS(Arrivals)`
<model>
1 <ETS(M,A,M)>
gg_tsresiduals(stl.ets)
gg_tsresiduals(myets)
# Both models pass the residual diagnostics.
#
# The decomposition model's autocorrelations sit slightly closer to the 95% limits, and its
# residual histogram is less centred on 0 than that of the regular ETS model.
#
#
fc.stl.ets %>% autoplot(stl.dcmp)
fc.myets %>% autoplot(data)
# the regular ets model makes better predictions.
# non-transformation model comparisons
data %>%
stretch_tsibble(.init = 10) %>%
model(
`ETS` = ETS(Arrivals),
`SNAIVE` = SNAIVE(Arrivals)
) %>%
forecast(h = 1) %>%
accuracy(data) %>%
select(-.type, -ME, -RMSSE, -ACF1)
# A tibble: 2 x 6
.model RMSE MAE MPE MAPE MASE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ETS 12685. 10006. 1.03 7.34 0.673
2 SNAIVE 19670. 15155. 5.05 10.3 1.02
# transformation model comparisons
stl.dcmp %>%
stretch_tsibble(.init = 10) %>%
model(
`STL ETS` = ETS(Arrivals),
`Log ETS` = ETS(log(Arrivals)~ error("A") +
trend("A") + season("A"))
) %>%
forecast(h=1) %>%
accuracy(stl.dcmp) %>%
select(-.type, -ME, -RMSSE, -ACF1)
# A tibble: 2 x 6
.model RMSE MAE MPE MAPE MASE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Log ETS 0.109 0.0813 -0.0114 0.692 0.733
2 STL ETS 0.101 0.0727 0.205 0.620 0.655
#
# The better performing models are the same as before.
# The ETS outperforms the SNAIVE.
# The STL dcmp ETS outperforms the Log ETS
#
data <- aus_production %>%
select(Quarter, Cement) %>%
as_tsibble(index = Quarter)
find <- data %>%
stretch_tsibble(.init = 5, .step = 1) %>%
model(
`ETS` = ETS(Cement),
`SNAIVE` = SNAIVE(Cement)
) %>%
forecast(h = 4) %>%
accuracy(data) %>%
select(-.type, -ME, -RMSSE, -ACF1)
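# The cross-validated accuracy table is stored in `find` but never displayed; sorting by
# RMSE makes the comparison explicit.
find %>% arrange(RMSE)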
# The ETS model has a smaller RMSE than the SNAIVE model, and all the other error
# measures are smaller for the ETS as well.
#
# A good way to choose the best forecasting model is to pick the one with the smallest
# RMSE computed using time series cross-validation.
#
# https://community.rstudio.com/t/compare-original-x-seasonally-adjusted-data/99517/5
# BEER
beer.data <- aus_production %>%
select(Quarter, Beer) %>%
as_tsibble(index = Quarter)
train <- beer.data %>%
filter_index(.~ "2007 Q1")
my_dcmp_spec <- decomposition_model(
STL(log(Beer) ~ season(window = Inf)),
ETS(season_adjust ~ season("N")), SNAIVE(season_year)
)
myfit <- train %>%
model(stl_ets = my_dcmp_spec) %>%
forecast(h = "3 years") %>%
accuracy(beer.data)
myfit2 <- train %>%
model(
ETS(Beer),
SNAIVE(Beer)
) %>%
forecast(h = "3 years") %>%
accuracy(beer.data)
compare <- rbind(myfit, myfit2)
compare
# A tibble: 3 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 stl_ets Test -4.08 15.0 13.4 -0.817 3.11 0.842 0.763 0.145
2 ETS(Beer) Test -2.12 9.16 8.06 -0.506 1.92 0.507 0.464 0.322
3 SNAIVE(Beer) Test -2.92 10.1 8.75 -0.642 2.08 0.550 0.510 0.294
train %>%
model(
ETS(Beer)
) %>%
forecast(h = "3 years") %>%
autoplot(beer.data)
# BRICKS -- STL ETS best
bricks.data <- aus_production %>%
select(Quarter, Bricks) %>%
as_tsibble(index = Quarter) %>%
drop_na()
train <- bricks.data %>%
filter_index(.~ "2002 Q1")
my_dcmp_spec <- decomposition_model(
STL(log(Bricks) ~ season(window = Inf)),
ETS(season_adjust ~ season("N")), SNAIVE(season_year)
)
myfit <- train %>%
model(stl_ets = my_dcmp_spec) %>%
forecast(h = "3 years")
fc <- myfit %>% accuracy(bricks.data)
myfit2 <- train %>%
model(
ETS(Bricks),
SNAIVE(Bricks)
) %>%
forecast(h = "3 years") %>%
accuracy(bricks.data)
compare <- rbind(fc, myfit2)
compare
# A tibble: 3 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 stl_ets Test 25.1 29.3 25.1 6.09 6.09 0.695 0.594 0.254
2 ETS(Bricks) Test 28.4 32.1 28.4 6.91 6.91 0.787 0.650 0.247
3 SNAIVE(Bricks) Test 47.1 52.3 47.1 11.4 11.4 1.30 1.06 0.217
myfit %>% autoplot(bricks.data)
# DIABETES -- STL ETS best
diab <- PBS %>%
index_by(Month) %>%
filter(ATC2 == "A10") %>%
summarize(Cost = sum(Cost)) %>%
as_tsibble(index = Month)
train <- diab %>%
filter_index(.~ "2005 May")
my_dcmp_spec <- decomposition_model(
STL(log(Cost) ~ season(window = Inf)),
ETS(season_adjust ~ season("N")), SNAIVE(season_year)
)
myfit <- train %>%
model(stl_ets = my_dcmp_spec) %>%
forecast(h = "3 years")
fc <- myfit %>% accuracy(diab)
myfit2 <- train %>%
model(
ETS(Cost),
SNAIVE(Cost)
) %>%
forecast(h = "3 years") %>%
accuracy(diab)
compare <- rbind(fc, myfit2)
compare
# A tibble: 3 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 stl_ets Test 517115. 1688903. 1434384. 1.84 7.16 1.47 1.44 -0.116
2 ETS(Cost) Test 1835076. 2722255. 2153117. 8.04 10.0 2.21 2.32 0.287
3 SNAIVE(Cost) Test 4357626. 5218219. 4370165. 20.1 20.2 4.48 4.45 0.676
myfit %>% autoplot(diab)
# CORTICOSTEROIDS -- SNAIVE best
roids <- PBS %>%
index_by(Month) %>%
filter(ATC2 == "H02") %>%
summarize(Cost = sum(Cost)) %>%
as_tsibble(index = Month)
train <- roids %>%
filter_index(.~ "2005 May")
my_dcmp_spec <- decomposition_model(
STL(log(Cost) ~ season(window = Inf)),
ETS(season_adjust ~ season("N")), SNAIVE(season_year)
)
myfit <- train %>%
model(stl_ets = my_dcmp_spec) %>%
forecast(h = "3 years")
fc <- myfit %>% accuracy(roids)
myfit2 <- train %>%
model(
ETS(Cost),
SNAIVE(Cost)
) %>%
forecast(h = "3 years") %>%
accuracy(roids)
compare <- rbind(fc, myfit2)
compare
# A tibble: 3 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 stl_ets Test 67806. 98854. 83184. 6.43 8.76 1.40 1.38 -0.0544
2 ETS(Cost) Test 74918. 104385. 88099. 7.20 9.23 1.48 1.46 -0.0499
3 SNAIVE(Cost) Test -13781. 84804. 70596. -1.16 7.74 1.19 1.19 0.0552
train %>%
model(
SNAIVE(Cost)
) %>%
forecast(h = "3 years") %>%
autoplot(roids)
# FOOD TURNOVER -- ETS best
turn <- aus_retail %>%
index_by(Month) %>%
filter(Industry == "Food retailing") %>%
summarize(Turnover = sum(Turnover)) %>%
as_tsibble(index = Month)
train <- turn %>%
filter_index(.~ "2015 Nov")
my_dcmp_spec <- decomposition_model(
STL(log(Turnover) ~ season(window = Inf)),
ETS(season_adjust ~ season("N")), SNAIVE(season_year)
)
myfit <- train %>%
model(stl_ets = my_dcmp_spec) %>%
forecast(h = "3 years")
fc <- myfit %>% accuracy(turn)
myfit2 <- train %>%
model(
ETS(Turnover),
SNAIVE(Turnover)
) %>%
forecast(h = "3 years") %>%
accuracy(turn)
compare <- rbind(fc, myfit2)
compare
# A tibble: 3 x 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 stl_ets Test -249. 291. 258. -2.39 2.48 0.972 0.954 0.440
2 ETS(Turnover) Test -68.8 143. 123. -0.701 1.19 0.463 0.467 0.0797
3 SNAIVE(Turnover) Test 627. 702. 627. 5.89 5.89 2.37 2.30 0.615
train %>%
model(
ETS(Turnover)
) %>%
forecast(h = "3 years") %>%
autoplot(turn)
These figures differ mainly in the length of the time series, T, which sets the width of the dashed blue bounds: the critical values sit at +/- 2/sqrt(T), so the longer the series, the narrower the bounds. Since no spikes fall outside the bounds in any of the plots, each series is consistent with white noise.
The autocorrelations themselves differ between the figures because each series is a different random sample; for white noise we only expect every autocorrelation to be close to zero, not identical across samples.
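# A small sketch illustrating the point above: white noise of increasing length, with the
# ACF bounds at +/- 2/sqrt(T) narrowing as T grows (wn() is a throwaway helper; patchwork,
# loaded above, combines the plots).
set.seed(1)
wn <- function(n) tsibble(idx = seq_len(n), y = rnorm(n), index = idx)
(wn(36) %>% ACF(y) %>% autoplot() + labs(title = "T = 36")) +
  (wn(360) %>% ACF(y) %>% autoplot() + labs(title = "T = 360")) +
  (wn(1000) %>% ACF(y) %>% autoplot() + labs(title = "T = 1000"))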
AMZN <- gafa_stock %>%
index_by(Date) %>%
filter(Symbol == "AMZN") %>%
summarize(Close = sum(Close)) %>%
mutate(day = row_number()) %>%
as_tsibble(index = day)
autoplot(AMZN)
acf <- AMZN %>% ACF(Close) %>%
autoplot() + labs(subtitle = "Amazon Closing Price")
acfD <- AMZN %>% ACF(difference(Close)) %>%
autoplot() + labs("Changes in Amazon closing price")
acf + acfD
#
#
# The ACF plot on the left is slow to decay to zero, which is characteristic of
# non-stationary data.
#
# There are ten autocorrelations outside the 95% limits, which suggests that the daily
# change in the AMZN stock price is not simply random and is correlated with the
# previous days' changes.
# Ljung-Box test
AMZN %>%
mutate(diff_close = difference(Close)) %>%
features(diff_close, ljung_box, lag = 10)
# A tibble: 1 x 2
lb_stat lb_pvalue
<dbl> <dbl>
1 35.6 0.0000981
#
# A formal portmanteau test for autocorrelation: Q* = T(T+2) * sum_{k=1}^{l} r_k^2 / (T - k),
# where r_k is the autocorrelation at lag k. If every r_k is close to 0, Q* will be small;
# if some r_k values are large (positive or negative), Q* will be large. l = 10 is suggested
# for non-seasonal data.
#
# Large values of Q* suggest that the autocorrelations do not come from a white noise series.
# Here the p-value is very small, so we conclude that the differenced series is
# distinguishable from white noise -- i.e. it is not white noise.
# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
AMZN %>%
features(Close, unitroot_kpss)
# A tibble: 1 x 2
kpss_stat kpss_pvalue
<dbl> <dbl>
1 14.0 0.01
# Small p-values suggest that differencing is required.
#
# Note that the KPSS p-value is only reported within the interval [0.01, 0.1]: a reported
# 0.01 means the p-value is 0.01 or smaller, and a reported 0.1 means it is 0.1 or larger.
#
# The data are non-stationary.
# differencing the data
AMZN %>%
mutate(diff_close = difference(Close)) %>%
features(diff_close, unitroot_kpss)
# A tibble: 1 x 2
kpss_stat kpss_pvalue
<dbl> <dbl>
1 0.149 0.1
#
# One difference is what it takes to make AMZN data stationary.
# unitroot_nsdiffs() determines whether seasonal differencing is required; it uses the
# measure of seasonal strength to decide how many seasonal differences are needed.
unitroot_nsdiffs(AMZN$Close)
nsdiffs
0
# 5.7 decomp
turkishGDP <- global_economy %>%
filter(Country == "Turkey") %>%
index_by(Year) %>%
summarize(GDP = GDP) %>%
as_tsibble(index = Year)
unitroot_nsdiffs(turkishGDP$GDP) # no seasonal differencing suggested (annual data has no seasonal period)
nsdiffs
0
turkishGDP %>% # the KPSS test does suggest a first difference is required
features(GDP, unitroot_kpss)
# A tibble: 1 x 2
kpss_stat kpss_pvalue
<dbl> <dbl>
1 1.22 0.01
gg_tsdisplay(turkishGDP)
turkishGDP %>%
gg_tsdisplay(difference(GDP), plot_type = 'partial')
# Box-Cox
lam = turkishGDP %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
turkish.bc <- turkishGDP %>%
mutate(GDP = box_cox(GDP, lam))
turkish.bc %>% # does suggest differencing required
features(GDP, unitroot_kpss)
# A tibble: 1 x 2
kpss_stat kpss_pvalue
<dbl> <dbl>
1 1.50 0.01
turkish.bc %>%
gg_tsdisplay(difference(GDP), plot_type = 'partial')
# log transformation
turkish.log <- turkishGDP %>%
mutate(GDP = log(GDP))
turkish.log %>%
gg_tsdisplay(difference(GDP), plot_type = 'partial')
turkish.log %>% # does suggest differencing required
features(GDP, unitroot_kpss)
# A tibble: 1 x 2
kpss_stat kpss_pvalue
<dbl> <dbl>
1 1.50 0.01
data <- aus_accommodation %>%
filter(State == "Tasmania") %>%
index_by(Quarter = Date) %>%
summarize(Takings = Takings) %>%
as_tsibble(index = Quarter)
gg_tsdisplay(data)
data %>%
gg_tsdisplay(difference(Takings), plot_type = 'partial')
data %>%
mutate(log_takings = log(Takings)) %>%
features(log_takings, unitroot_ndiffs)
# A tibble: 1 x 1
ndiffs
<int>
1 1
# returns 1, indicating one difference is required,
# so we also apply the unitroot_ndiffs() function to the seasonally differenced data.
data %>%
mutate(sdiff_log_takings = difference(log(Takings), 4)) %>%  # quarterly data, so seasonal lag 4
features(sdiff_log_takings, unitroot_ndiffs)
# A tibble: 1 x 1
ndiffs
<int>
1 1
# these functions suggest we should do both a seasonal difference and a first difference.
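#
# A hedged cross-check with the seasonal-strength based feature: unitroot_nsdiffs() on the
# logged series should confirm that one seasonal difference is appropriate here.
data %>%
  features(log(Takings), unitroot_nsdiffs)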
souvenirs %>%
features(Sales, unitroot_ndiffs) # yes
# A tibble: 1 x 1
ndiffs
<int>
1 1
souvenirs %>%
mutate(log_sales = log(Sales)) %>%
features(log_sales, unitroot_ndiffs) # yes
# A tibble: 1 x 1
ndiffs
<int>
1 1
souvenirs %>%
mutate(log_sales = difference(log(Sales), 12)) %>% # no
features(log_sales, unitroot_kpss)
# A tibble: 1 x 2
kpss_stat kpss_pvalue
<dbl> <dbl>
1 0.239 0.1
# so a seasonal difference of the log series appears sufficient; no further first difference is required.
souvenirs$Sales %>% diff(lag = 12) %>% diff() %>% ggtsdisplay()
fit <- Arima(souvenirs$Sales, order=c(0,1,1), seasonal = c(0,1,1))
checkresiduals(fit, lag = 12)
Ljung-Box test
data: Residuals from ARIMA(0,1,1)
Q* = 56.715, df = 11, p-value = 3.755e-08
Model df: 1. Total lags used: 12
myarima = arima(souvenirs$Sales, order=c(2,1,1), seasonal=list(order = c(0,1,1)))
summary(myarima)
Call:
arima(x = souvenirs$Sales, order = c(2, 1, 1), seasonal = list(order = c(0,
1, 1)))
Coefficients:
ar1 ar2 ma1 sma1
0.3084 -0.0794 -0.9110 -0.9110
s.e. 0.1464 0.1411 0.0945 0.0945
sigma^2 estimated as 165843847: log likelihood = -895.45, aic = 1800.9
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 1131.804 12723.8 6423.095 -29.06718 50.50366 0.9393428 -0.01416315
checkresiduals(myarima)
Ljung-Box test
data: Residuals from ARIMA(2,1,1)
Q* = 5.4431, df = 6, p-value = 0.4884
Model df: 4. Total lags used: 10
myarima = auto.arima(souvenirs$Sales)
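#
# Because souvenirs$Sales above is a bare numeric vector, the seasonal orders in the
# Arima()/arima() calls are applied at period 1 (or dropped) rather than at lag 12. A hedged
# fable sketch of the intended ARIMA(0,1,1)(0,1,1)[12], where the monthly period is picked
# up from the tsibble index (the model label sarima is arbitrary):
souvenirs %>%
  model(sarima = ARIMA(Sales ~ pdq(0,1,1) + PDQ(0,1,1))) %>%
  report()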
# credit:
# https://rpubs.com/azureblue83/478907
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.6*y[i-1] + e[i]
sim <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim)
# AR(1) simulator for a given phi (returning ts(numeric(n)) here would let autoplot() work directly)
myfunction <- function(phi, sd=1, n=100){
y <- numeric(n)
e <- rnorm(n, sd=sd)
for(i in 2:n)
y[i] <- phi*y[i-1] + e[i]
return(y)
}
mylist <- list()
i <- 0
phi.vals <- c(0.0006, 0.006, 0.06, 0.6)
for(phi in phi.vals){
i <- i + 1
mylist[[i]] <- myfunction(phi)
}
data <- do.call(dplyr::bind_cols , mylist)
data <- cbind(seq(1:100), data)
names(data)[1] <- "index"
names(data)[2:5] <- paste("phi.", phi.vals, sep = "")
names(data)[2] <- "phi.0.0006"
data %>%
as_tsibble(index = index)
# A tsibble: 100 x 5 [1]
index phi.0.0006 phi.0.006 phi.0.06 phi.0.6
<int> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 0 0
2 2 0.467 -0.774 1.07 0.470
3 3 0.849 1.60 0.910 1.40
4 4 -0.188 0.146 0.739 1.24
5 5 -1.05 1.94 1.83 2.16
6 6 0.470 0.0965 -1.43 2.04
7 7 -0.136 -0.992 -0.500 -0.379
8 8 -0.367 0.734 0.200 -2.49
9 9 0.786 0.805 -0.354 -2.52
10 10 -0.935 -0.0684 -0.304 -2.67
# ... with 90 more rows
ggplot(data = data, mapping=aes(x = index)) +
geom_line(aes(y = phi.0.6), color = "red") +
geom_line(aes(y = phi.0.06), color = "blue") +
geom_line(aes(y = phi.0.006), color = "orange") +
geom_line(aes(y = phi.0.0006), color = "darkgreen")
# colnames(gen.data) <- paste('phi=', phi.vec, sep = '')
# autoplot(gen.data) + ylab('Data')
# These two commented lines would do the same as all the code above, provided the function returned a ts() object.
par(mfrow=c(1,4))
acf(data[,2], main="Phi=0.0006")
acf(data[,3], main="Phi=0.006")
acf(data[,4], main="Phi=0.06")
acf(data[,5], main="Phi=0.6")
ma.1 <- function(theta, sd=1, n=100){
y = ts(numeric(n))
e <- rnorm(n, sd=sd)
e[1] <- 0
for(i in 2:n)
y[i] <- theta*e[i-1] + e[i]
return(y)
}
mylist <- list()
i <- 0
theta.vec <- c(0.000000001, 0.00001, 0.01, 0.1) * 6
for(theta in theta.vec){
i <- i + 1
mylist[[i]] <- ma.1(theta)
}
gen.data <- do.call(cbind, mylist)
colnames(gen.data) <- paste("theta=", theta.vec, sep="")
autoplot(gen.data) + ylab('Data')
par(mfrow=c(1,4))
acf(gen.data[,1], main="Theta=0.000000006")
acf(gen.data[,2], main="Theta=0.00006")
acf(gen.data[,3], main="Theta=0.06")
acf(gen.data[,4], main="Theta=0.6")
# ARMA(1,1)
y1 <- ts(numeric(100))
e <- rnorm(100, sd=1)
for(i in 2:100)
y1[i] <- 0.6*y1[i-1] + 0.6*e[i-1] + e[i]
autoplot(y1) +
ggtitle('ARMA(1,1) Generated Data')
# AR(2)
y2 <- ts(numeric(100))
e <- rnorm(100, sd=1)
for(i in 3:100)
y2[i] <- -0.8*y2[i-1] + 0.3*y2[i-2] + e[i]
autoplot(y2) +
ggtitle("AR(2) Generated Data")
par(mfrow=c(1,2))
acf(y1, main = "ARMA(1,1)")
acf(y2, main = "AR(2) Generated Data")
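#
# Note on the AR(2) above: stationarity requires phi1 + phi2 < 1, phi2 - phi1 < 1 and
# |phi2| < 1. With phi1 = -0.8 and phi2 = 0.3, phi2 - phi1 = 1.1, so the process is
# non-stationary, which is why the AR(2) series oscillates with ever-growing amplitude
# while the ARMA(1,1) series stays roughly stable.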
data <- aus_airpassengers %>%
as_tsibble(index = Year)
myfit <- data %>%
model(
ARIMA(Passengers)
)
report(myfit) # ARIMA(0,2,1) chosen
Series: Passengers
Model: ARIMA(0,2,1)
Coefficients:
ma1
-0.8963
s.e. 0.0594
sigma^2 estimated as 4.308: log likelihood=-97.02
AIC=198.04 AICc=198.32 BIC=201.65
gg_tsdisplay(data)
gg_tsresiduals(myfit) # residuals look like white noise
myfc <- myfit %>% forecast(h = 10)
myfc %>% autoplot(data)
data.s <- data$Passengers %>% diff(1) %>% diff(1)
ggtsdisplay(data.s)
data.s %>% unitroot_kpss() # 0.1 so we fail to reject the null, so,
kpss_stat kpss_pvalue
0.03614017 0.10000000
# the twice-differenced data are stationary.
# There is a significant spike at lag 1 in both the ACF and PACF plots.
# In backshift notation the fitted ARIMA(0,2,1) is (1 - B)^2 y_t = (1 + theta_1 B) e_t;
# adding an AR(1) term gives the ARIMA(1,2,1) candidate fitted below.
(fit <- Arima(data$Passengers, order=c(1,2,1)))
Series: data$Passengers
ARIMA(1,2,1)
Coefficients:
ar1 ma1
-0.1404 -0.8758
s.e. 0.1555 0.0691
sigma^2 = 4.323: log likelihood = -96.62
AIC=199.24 AICc=199.83 BIC=204.66
checkresiduals(fit)
Ljung-Box test
data: Residuals from ARIMA(1,2,1)
Q* = 5.4475, df = 7, p-value = 0.6055
Model df: 2. Total lags used: 9
fit2 <- Arima(data$Passengers, order=c(0,1,0))
checkresiduals(fit2)
Ljung-Box test
data: Residuals from ARIMA(0,1,0)
Q* = 7.046, df = 9, p-value = 0.6323
Model df: 0. Total lags used: 9
# The model above and the one below are the same ARIMA(0,1,0), fitted two different ways;
# note that fable's version includes a drift constant by default here.
myfit2 <- data %>%
model(
ARIMA(Passengers ~ pdq(0,1,0))
)
report(myfit2)
Series: Passengers
Model: ARIMA(0,1,0) w/ drift
Coefficients:
constant
1.4191
s.e. 0.3014
sigma^2 estimated as 4.271: log likelihood=-98.16
AIC=200.31 AICc=200.59 BIC=203.97
gg_tsresiduals(myfit2)
myfit3 <- data %>%
model(
ARIMA(Passengers ~ 1 + pdq(2,1,2))
)
report(myfit3)
Series: Passengers
Model: ARIMA(2,1,2) w/ drift
Coefficients:
ar1 ar2 ma1 ma2 constant
-0.5518 -0.7327 0.5895 1.0000 3.2216
s.e. 0.1684 0.1224 0.0916 0.1008 0.7224
sigma^2 estimated as 4.031: log likelihood=-96.23
AIC=204.46 AICc=206.61 BIC=215.43
gg_tsresiduals(myfit3)
myfit4 <- data %>%
model(
ARIMA(Passengers ~ pdq(2,2,2))
)
report(myfit4)
Series: Passengers
Model: ARIMA(2,2,2)
Coefficients:
ar1 ar2 ma1 ma2
-1.0119 -0.1733 -0.0015 -0.7567
s.e. 0.3556 0.1558 0.3411 0.3047
sigma^2 estimated as 4.488: log likelihood=-96.42
AIC=202.84 AICc=204.38 BIC=211.87
gg_tsresiduals(myfit4)
myfit5 <- data %>%
model(
ARIMA(Passengers ~ 1 + pdq(0,2,1))
)
report(myfit5)
Series: Passengers
Model: ARIMA(0,2,1) w/ poly
Coefficients:
ma1 constant
-1.0000 0.0571
s.e. 0.0585 0.0213
sigma^2 estimated as 3.855: log likelihood=-95.1
AIC=196.21 AICc=196.79 BIC=201.63
gg_tsresiduals(myfit5)
# An ARIMA(0,2,1) with a constant induces a quadratic (or higher-order) polynomial trend:
# with d differences and a non-zero constant, the long-run forecasts follow a polynomial
# of order d. This is discouraged; remove the constant or reduce the number of differences.
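#
# A hedged sketch of that point: the same ARIMA(0,2,1) with and without a constant,
# forecast far ahead -- the version with the constant bends into a quadratic trend.
data %>%
  model(
    `no constant`   = ARIMA(Passengers ~ 0 + pdq(0,2,1)),
    `with constant` = ARIMA(Passengers ~ 1 + pdq(0,2,1))
  ) %>%
  forecast(h = 30) %>%
  autoplot(data, level = NULL)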
data <- global_economy %>%
filter(Country == "United States") %>%
index_by(Year) %>%
summarize(GDP = GDP/1000) %>%
as_tsibble(index = Year)
ggtsdisplay(data$GDP)
# The plots show no seasonality, only an upward trend.
# Differencing alone does not produce stationary-looking data, so a Box-Cox transformation is tried first.
# Box-Cox
lam = data %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
data.bc <- data %>%
mutate(GDP = box_cox(GDP, lam))
ggtsdisplay(data.bc$GDP) # still needs differencing
data.bc <- data.bc$GDP %>% diff(1)
ggtsdisplay(data.bc)
data.bc %>% unitroot_kpss()
kpss_stat kpss_pvalue
0.2082533 0.1000000
# Good -- needed box-cox transformation and differencing one time.
data.t <- dplyr::bind_cols(data[c(2:58),1], data.bc)
names(data.t)[2] <- "GDP"
myfit <- data.t %>%
model(
ARIMA(GDP)
)
report(myfit)
Series: GDP
Model: ARIMA(1,0,0) w/ mean
Coefficients:
ar1 constant
0.4586 16.8546
s.e. 0.1198 1.3555
sigma^2 estimated as 111.4: log likelihood=-214.31
AIC=434.62 AICc=435.07 BIC=440.75
#gg_tsresiduals(myfit)
myfit2 <- data.t %>% # best model
model(
ARIMA(GDP ~ pdq(1,1,1))
)
report(myfit2)
Series: GDP
Model: ARIMA(1,1,1)
Coefficients:
ar1 ma1
0.3325 -0.8461
s.e. 0.1849 0.1190
sigma^2 estimated as 117.1: log likelihood=-212.17
AIC=430.34 AICc=430.81 BIC=436.42
gg_tsresiduals(myfit2)
# The residuals are in-bounds and the distribution is relatively normal.
myfit3 <- data.t %>%
model(
ARIMA(GDP ~ pdq(0,1,0))
)
report(myfit3)
Series: GDP
Model: ARIMA(0,1,0)
sigma^2 estimated as 146.4: log likelihood=-219.08
AIC=440.16 AICc=440.24 BIC=442.19
#gg_tsresiduals(myfit3)
myfc <- myfit2 %>% forecast(h = 7)
myfc %>% autoplot(data.t)
# The forecasts look reasonable.
myets <- data %>%
model(
ETS(GDP)
)
report(myets)
Series: GDP
Model: ETS(M,A,N)
Smoothing parameters:
alpha = 0.9993593
beta = 0.5007829
Initial states:
l[0] b[0]
448093334 64917356
sigma^2: 7e-04
AIC AICc BIC
2389.487 2390.641 2399.789
gg_tsresiduals(myets) # residual plots look roughly the same
fc2 <- myets %>% forecast(h = 7)
fc2 %>% autoplot(data)
data <- aus_arrivals %>%
filter(Origin == "Japan") %>%
summarize(Arrivals = Arrivals) %>%
as_tsibble(index = Quarter)
ggtsdisplay(data$Arrivals)
# Main plot appears to have seasonality present.
# ACF is slow to decrease, suggesting that the data is non-stationary.
data$Arrivals %>% unitroot_kpss()
kpss_stat kpss_pvalue
1.189464 0.010000
data$Arrivals %>% unitroot_ndiffs()
ndiffs
1
data.s <- data$Arrivals %>% diff(1)
data.s %>% unitroot_kpss() # only required one round of differencing
kpss_stat kpss_pvalue
0.2911011 0.1000000
ggtsdisplay(data.s)
# The differenced series still shows strong autocorrelation, with many significant spikes.
# The PACF appears to decay exponentially, suggesting an ARIMA(0,d,q) form (plus a seasonal component).
myfit <- data %>% # my chosen model
model(
ARIMA(Arrivals ~ pdq(0,1,0) + PDQ(0,1,1))
)
report(myfit)
Series: Arrivals
Model: ARIMA(0,1,0)(0,1,1)[4]
Coefficients:
sma1
-0.6273
s.e. 0.0669
sigma^2 estimated as 2e+08: log likelihood=-1339.54
AIC=2683.08 AICc=2683.18 BIC=2688.69
gg_tsresiduals(myfit)
myfit2 <- data %>% # chosen best model -- ARIMA(0,1,1)(1,1,1)[4]
model(
ARIMA(Arrivals)
)
report(myfit2)
Series: Arrivals
Model: ARIMA(0,1,1)(1,1,1)[4]
Coefficients:
ma1 sar1 sma1
-0.4228 -0.2305 -0.5267
s.e. 0.0944 0.1337 0.1246
sigma^2 estimated as 174801727: log likelihood=-1330.66
AIC=2669.32 AICc=2669.66 BIC=2680.54
gg_tsresiduals(myfit2)
# ARIMA() selects a different model which is slightly better, as judged by its smaller
# AIC, AICc and BIC values; it also has the higher log likelihood.
myfit3 <- data %>% # my chosen model - without backshift operator
model(
ARIMA(Arrivals ~ pdq(0,1,0))
)
report(myfit3)
Series: Arrivals
Model: ARIMA(0,1,0)(0,1,2)[4]
Coefficients:
sma1 sma2
-0.7021 0.1503
s.e. 0.0869 0.0966
sigma^2 estimated as 197681552: log likelihood=-1338.36
AIC=2682.72 AICc=2682.92 BIC=2691.13
gg_tsresiduals(myfit3)
data <- aus_production %>%
select(Quarter, Gas) %>%
as_tsibble(index = Quarter)
ggtsdisplay(data$Gas)
# The data is highly seasonal and the ACF plot suggests that it is non-stationary.
data$Gas %>% unitroot_kpss() # needs differencing
kpss_stat kpss_pvalue
4.419611 0.010000
data$Gas %>% unitroot_ndiffs()
ndiffs
1
data.s <- data$Gas %>% diff(1)
data.s %>% unitroot_kpss()
kpss_stat kpss_pvalue
0.0454123 0.1000000
ggtsdisplay(data.s)
# Seasonally differenced data
data %>% gg_tsdisplay(difference(log(Gas), 4),  # quarterly data, so seasonal lag 4
plot_type = 'partial', lag_max = 24)
myfit <- data %>%
model(
ARIMA(Gas ~ pdq(1,0,0) + PDQ(1,1,0))
)
report(myfit)
Series: Gas
Model: ARIMA(1,0,0)(1,1,0)[4] w/ drift
Coefficients:
ar1 sar1 constant
0.7575 -0.4711 1.4387
s.e. 0.0462 0.0626 0.3050
sigma^2 estimated as 20.53: log likelihood=-626.26
AIC=1260.52 AICc=1260.71 BIC=1273.98
gg_tsresiduals(myfit)
myfit2 <- data %>%
model(
ARIMA(Gas ~ pdq(2,0,1) + PDQ(0,1,1))
)
report(myfit2)
Series: Gas
Model: ARIMA(2,0,1)(0,1,1)[4] w/ drift
Coefficients:
ar1 ar2 ma1 sma1 constant
0.9771 -0.0853 -0.3155 -0.5225 0.4343
s.e. 0.2660 0.2178 0.2510 0.0581 0.0978
sigma^2 estimated as 20.02: log likelihood=-622.6
AIC=1257.19 AICc=1257.6 BIC=1277.39
gg_tsresiduals(myfit2)
# ARIMA(2,0,1)(0,1,1)[4] is best according to AIC value
# Residuals have a few autocorrelations at the boundary. There may be a better
# model.
myfit3 <- data %>%
model(
ARIMA(Gas ~ pdq(3,0,0) + PDQ(1,1,1))
)
report(myfit3)
Series: Gas
Model: ARIMA(3,0,0)(1,1,1)[4] w/ drift
Coefficients:
ar1 ar2 ar3 sar1 sma1 constant
0.6827 0.0158 0.1669 -0.1875 -0.4357 0.6338
s.e. 0.0692 0.0876 0.0774 0.1198 0.1225 0.1643
sigma^2 estimated as 19.49: log likelihood=-619.33
AIC=1252.67 AICc=1253.21 BIC=1276.23
gg_tsresiduals(myfit3)
myfit4 <- data %>%
model(
ARIMA(Gas)
)
report(myfit4)
Series: Gas
Model: ARIMA(0,0,3)(0,1,0)[4] w/ drift
Coefficients:
ma1 ma2 ma3 constant
0.7684 0.5258 0.6811 4.0231
s.e. 0.0492 0.0712 0.0551 0.8661
sigma^2 estimated as 18.7: log likelihood=-616.36
AIC=1242.73 AICc=1243.02 BIC=1259.56
gg_tsresiduals(myfit4)
# This model performs the best according to the AIC.
# The residuals are not significantly improved.
myfc <- myfit4 %>% forecast(h = 24)
myfc %>% autoplot(data)
myets <- data %>%
model(
ETS(Gas)
)
report(myets) # note: ETS and ARIMA information criteria are not directly comparable (different model classes)
Series: Gas
Model: ETS(M,A,M)
Smoothing parameters:
alpha = 0.6528545
beta = 0.1441675
gamma = 0.09784922
Initial states:
l[0] b[0] s[0] s[-1] s[-2] s[-3]
5.945592 0.07062881 0.9309236 1.177883 1.074851 0.8163427
sigma^2: 0.0032
AIC AICc BIC
1680.929 1681.794 1711.389
gg_tsresiduals(myets) # worse residuals
ets.fc <- myets %>% forecast(h = 24)
ets.fc %>% autoplot(data)
data.t <- dplyr::bind_cols(data[c(2:218),1], data.s)
names(data.t)[2] <- "Gas"
myfit <- data.t %>%
model(
ARIMA(Gas~ pdq(0,0,3) + PDQ(0,0,0))
)
report(myfit)
Series: Gas
Model: ARIMA(0,0,3) w/ mean
Coefficients:
ma1 ma2 ma3 constant
-0.7322 -0.5461 0.6187 1.0048
s.e. 0.0454 0.0446 0.0410 0.2661
sigma^2 estimated as 135.2: log likelihood=-839.43
AIC=1688.85 AICc=1689.14 BIC=1705.75
# Fitting a non-seasonal model to the differenced series performs worse here; the better
# approach is to keep a seasonal model (or to seasonally adjust the data before fitting a
# non-seasonal one).
# that's all, folks