library(fpp3)
library(readxl)
piggies <- aus_livestock %>%
filter(Animal == "Pigs" & State == "Victoria") %>%
filter(Month >= yearmonth(as.Date("2010-01-01")) & Month <= yearmonth(as.Date("2018-08-01")))
pigplot <- piggies %>%
autoplot(Count) +
labs(title = 'Pigs Slaughtered')
pigplot
fitpig <- piggies %>%
model(ses = ETS(Count ~ error('A') + trend('N') + season('N')))
fitpig %>% report()
## Series: Count
## Model: ETS(A,N,N)
## Smoothing parameters:
## alpha = 0.2508843
##
## Initial states:
## l[0]
## 66303.48
##
## sigma^2: 48567138
##
## AIC AICc BIC
## 2327.637 2327.877 2335.570
fourmonth <- fitpig %>%
forecast(h = 4)
fourmonthplot <- fitpig %>%
forecast(h = 4) %>%
autoplot(piggies) +
labs(title = 'Four Month Forecast Data')
fourmonthplot
alpha = 0.2508843 l0 = 66303.48
firstfore <- fourmonth %>%
pull(Count) %>%
head(1)
standardDeviation <- augment(fitpig) %>%
pull(.resid) %>%
sd()
bottom <- firstfore - 1.96 * standardDeviation
top <- firstfore + 1.96 * standardDeviation
results <- c(bottom, top)
names(results) <- c('Lower', 'Upper')
results
## <distribution[2]>
## Lower Upper
## N(84133, 4.9e+07) N(110907, 4.9e+07)
hilo(fourmonth$Count, 95)
## <hilo[4]>
## [1] [83861.13, 111179.2]95 [83437.82, 111602.5]95 [83026.87, 112013.4]95
## [4] [82627.25, 112413.0]95
The 95% interval that I found is from 84,133 to 110,907
The 95% interval that R found were slightly wider (from 83,861.13 to 111,179.2)
bangEx <- global_economy %>%
filter(Code == 'BGD')
head(bangEx)
## # A tsibble: 6 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bangladesh BGD 1960 4274893913. NA NA 9.31 10.0 48199747
## 2 Bangladesh BGD 1961 4817580184. 6.06 NA 11.7 10.8 49592802
## 3 Bangladesh BGD 1962 5081413340. 5.45 NA 10.8 10.7 51030137
## 4 Bangladesh BGD 1963 5319458351. -0.456 NA 11.7 9.98 52532417
## 5 Bangladesh BGD 1964 5386054619. 11.0 NA 14.1 10.0 54129100
## 6 Bangladesh BGD 1965 5906636557. 1.61 NA 13.4 9.66 55834038
bangEx %>%
autoplot(Exports) +
labs(title = 'Bangladesh Exports')
This data displays a downward trend from 1960 to around 1975, then the data displays a upward trend from around 1975 to around 2012. Then the data starts to trend downwards again.
fitbang <- bangEx %>%
model(ANN = ETS(Exports ~ error('A') + trend('N') + season('N')))
bangExfore <- fitbang %>%
forecast(h = 10)
bangExfore %>% autoplot(bangEx) +
labs(title = 'Bangladesh 10 year Exports Forecast')
accuracy(fitbang)
## # A tibble: 1 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bangladesh ANN Training 0.0869 1.25 0.945 -0.891 12.0 0.983 0.991 0.0148
The RMSE for the training data is 1.253158.
bangcomp <- bangEx %>%
model(
ANN = ETS(Exports ~ error('A') + trend('N') + season('N')),
AAN = ETS(Exports ~ error('A') + trend('A') + season('N')))
accuracy(bangcomp)
## # A tibble: 2 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bangladesh ANN Training 0.0869 1.25 0.945 -0.891 12.0 0.983 0.991 0.0148
## 2 Bangladesh AAN Training 0.00557 1.25 0.954 -1.90 12.2 0.993 0.989 0.0135
The AAN model has a slightly lower RMSE which means that it is a more accurate model for this data.
bangcomp %>%
forecast(h = 10) %>%
autoplot(bangEx) +
labs(title = 'Bangladesh Exports Model Comparison')
The forecast looks like the AAN model is better for forecasting this data. The ANN forecast shows the data leveling off which does not fit the overall trend of the data. The AAN model shows an upward trend in the data which seems like a much more accurate forecast with the trend of the data.
chineseGDP <- global_economy %>%
filter(Country == 'China')
chineseGDP %>% autoplot(GDP) +
labs(title = 'Chinese GDP')
bigL <- chineseGDP %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
chineseGDPComp <- chineseGDP %>%
model(
ETS = ETS(GDP),
BoxCox = ETS(box_cox(GDP, bigL)),
Damped = ETS(GDP ~ trend('Ad', phi = 0.9)),
Log = ETS(log(GDP)))
chineseGDPComp %>%
forecast(h = 20) %>%
autoplot(chineseGDP, level = NULL) +
labs(title = 'Chinese GDP ETS Forecast Options Comparison')
aus_production %>%
autoplot(Gas)
fitaus <- aus_production %>%
model(Multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M")))
report(fitaus)
## Series: Gas
## Model: ETS(M,A,M)
## Smoothing parameters:
## alpha = 0.6528545
## beta = 0.1441675
## gamma = 0.09784922
##
## Initial states:
## l[0] b[0] s[0] s[-1] s[-2] s[-3]
## 5.945592 0.07062881 0.9309236 1.177883 1.074851 0.8163427
##
## sigma^2: 0.0032
##
## AIC AICc BIC
## 1680.929 1681.794 1711.389
fitaus %>%
forecast(h = 36) %>%
autoplot(aus_production)
fitagain <- aus_production %>%
model(Damped = ETS(Gas ~ error("A") + trend("Ad") + season("N")))
report(fitagain)
## Series: Gas
## Model: ETS(A,Ad,N)
## Smoothing parameters:
## alpha = 0.08442082
## beta = 0.01242805
## phi = 0.9799999
##
## Initial states:
## l[0] b[0]
## 5.919847 0.2516645
##
## sigma^2: 251.9636
##
## AIC AICc BIC
## 2386.146 2386.544 2406.453
fitagain %>%
forecast(h = 36) %>%
autoplot(aus_production)
modelComparison <- aus_production %>%
model(
Multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M")),
Damped = ETS(Gas ~ error("A") + trend("Ad") + season("N")))
glance(modelComparison)
## # A tibble: 2 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Multiplicative 0.00324 -831. 1681. 1682. 1711. 21.1 32.2 0.0413
## 2 Damped 252. -1187. 2386. 2387. 2406. 246. 241. 11.7
multiplicative seasonality is necessary due to the fact that the seasonal variation seems to change proportionally to the level.
The multiplicative method slightly outperformed the damped version of the model across the board. This means that damping didn’t improve the model and that the multiplicative method is a stronger model.
set.seed(7777777)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries %>% autoplot(Turnover)
Multiplicative seasonality is good for this data because the magnitude of the seasonality fluctuation increases with the level.
fitturn <- myseries %>%
model(
'Multiplicative Method' = ETS(Turnover ~ error('M') + trend('A') + season('M')),
'Damped Method' = ETS(Turnover ~ error('M') + trend('Ad') + season('M'))
)
HoltWinters <- fitturn %>% forecast(h = 36)
HoltWinters %>% autoplot(myseries, level = NULL)
accuracy(fitturn) %>% select('.model', 'RMSE')
## # A tibble: 2 × 2
## .model RMSE
## <chr> <dbl>
## 1 Multiplicative Method 10.9
## 2 Damped Method 11.0
The multiplicative method has a slightly lower RMSE than the damped method. This means that this may be the more accurate model for the data.
fitturn %>%
select('Multiplicative Method') %>%
gg_tsresiduals()
#better
fitturn %>%
select('Damped Method') %>%
gg_tsresiduals()
Both the residuals plot and histogram for the multiplicative method confirms that the residuals look like white noise with the exception of a few outliers. The ACF confirms that most of the residuals are within bounds. The damped method’s residuals look a little less like white noise and the residuals arent normally distributed.
myseries_train <- myseries %>%
filter(year(Month) < 2011)
fitty <- myseries_train %>%
model(
'Multiplicative Method' = ETS(Turnover ~ error('M') + trend('A') + season('M')),
'Damped Method' = ETS(Turnover ~ error('M') + trend('Ad') + season('M')),
'Seasonal Naive' = SNAIVE(Turnover))
comparison <- anti_join(myseries, myseries_train, by = c('State', 'Industry', 'Series ID', 'Month', 'Turnover'))
forecastResults <- fitty %>% forecast(comparison)
autoplot(comparison, Turnover) +
autolayer(forecastResults, level = NULL) +
labs(title = 'Forecast Method Comparison')
accuracy(fitty)
## # A tibble: 3 × 12
## State Indus…¹ .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victo… Hardwa… Multi… Trai… -0.453 9.86 7.22 -0.547 5.07 0.491 0.524 0.251
## 2 Victo… Hardwa… Dampe… Trai… 0.491 9.78 7.22 0.214 5.05 0.490 0.519 0.0581
## 3 Victo… Hardwa… Seaso… Trai… 6.74 18.8 14.7 3.93 9.84 1 1 0.759
## # … with abbreviated variable name ¹Industry
The Damped and Multiplicative methods both beat the Seasonal Niave method, with the Multiplicative method being the most accurate.
trippy <- tourism %>%
summarise(Trips = sum(Trips))
trippy %>%
autoplot(Trips)
The data is seasonal. There is a downward trend until around 2010, then there is a stronger upward trend.
decomp <- trippy %>%
model(STL(Trips)) %>% components()
decomp %>%
as_tsibble() %>%
autoplot(season_adjust)
trippy %>%
model(
decomposition_model(STL(Trips),
ETS(season_adjust ~ error("A") + trend("Ad") + season("N")))) %>%
forecast(h = "2 years") %>%
autoplot(trippy)
trippy %>%
model(decomposition_model(STL(Trips),
ETS(season_adjust ~ error("A") + trend("A") + season("N")))) %>%
forecast(h = "2 years") %>%
autoplot(trippy)
trippy %>%
model(ETS(Trips)) %>%
forecast(h = "2 years") %>%
autoplot(trippy)
fittrip <- trippy %>%
model(
Damped = decomposition_model(STL(Trips),
ETS(season_adjust ~ error("A") + trend("Ad") + season("N"))),
Holt = decomposition_model(STL(Trips),
ETS(season_adjust ~ error("A") + trend("A") + season("N"))),
ETS = ETS(Trips))
accuracy(fittrip)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Damped Training 103. 763. 576. 0.367 2.72 0.607 0.629 -0.0174
## 2 Holt Training 99.7 763. 574. 0.359 2.71 0.604 0.628 -0.0182
## 3 ETS Training 105. 794. 604. 0.379 2.86 0.636 0.653 -0.00151
Using the RMSE value, the Holt method to the STL Decomposition is the slightly better in-sample method.
fittrip %>%
forecast(h = "2 years") %>%
autoplot(trippy, level = NULL) +
guides(colour=guide_legend(title="Forecast"))
The forecasts are all very similar therefore, the model with the smallest RMSE value is most likely to be the best. (Holt)
goodest <- fittrip %>%
select(Holt)
goodest %>%
gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
augment(goodest) %>%
features(.resid, ljung_box, lag = 24, dof = 5)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Holt 33.3 0.0221
The residuals seem to be close white noise, but when I did the ljung box test I found that my p-value is less than 0.05. This means that the residuals are not white noise. I believe this is due to a huge outlier.
bigZ <- aus_arrivals %>% filter(Origin == 'NZ')
autoplot(bigZ, Arrivals) +
ggtitle("Arrivals from New Zealand")
The data has an upward trend, also the number of arrivals fall each Q1.
choochoo <- bigZ %>%
slice(1:(nrow(bigZ)-(4*2)))
choochoo %>%
model(
ETS(Arrivals ~ error("M") + trend("A") + season("M"))) %>%
forecast(h="2 years") %>%
autoplot(level = NULL) +
autolayer(bigZ, Arrivals) +
labs(title = "Train vs Test")
Multiplicative seasonality is important because the size of the seasonal pattern grows in proportion to the level of the trend.
letssee <- anti_join(bigZ, choochoo,
by = c("Quarter", "Origin", "Arrivals"))
fitem <- choochoo %>%
model(
'ETS model' = ETS(Arrivals),
'Additive to Log' = ETS(log(Arrivals) ~ error("A") + trend("A") + season("A")),
'Seasonal Naïve Method' = SNAIVE(Arrivals),
'STL decomposition to Log' = decomposition_model(STL(log(Arrivals)),
ETS(season_adjust)))
fofo <- fitem %>%
forecast(h="2 years")
fofo %>%
autoplot(level = NULL) +
autolayer(letssee, Arrivals) +
guides(colour=guide_legend(title="Forecast")) +
labs(title = "New Zealand Visitors")
fofo %>% accuracy(bigZ)
## # A tibble: 4 × 11
## .model Origin .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive … NZ Test -7093. 17765. 12851. -2.17 4.20 0.864 0.918 0.0352
## 2 ETS model NZ Test -3495. 14913. 11421. -0.964 3.78 0.768 0.771 -0.0260
## 3 Seasonal … NZ Test 9709. 18051. 17156. 3.44 5.80 1.15 0.933 -0.239
## 4 STL decom… NZ Test -12535. 22723. 16172. -4.02 5.23 1.09 1.17 0.109
daddymodel <- fitem %>% select('ETS model')
daddymodel %>%
gg_tsresiduals()
augment(daddymodel) %>%
features(.resid, ljung_box)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ETS model 1.29 0.256
Using the RMSE value, the best forecast is the ETS model’s forecast. When you look at the plot this also looks the best.
Both the acf plot and ljung box tests show that the ETS model passes the residual tests. Everything in the ACF plot is within the bounds and the ljung box value is more than 0.05.
vallygirl <- bigZ %>%
slice(1:(n() - 3)) %>%
stretch_tsibble(.init = 36, .step = 3)
vallygirl %>%
model(
'ETS model' = ETS(Arrivals),
'Additive to Log' = ETS(log(Arrivals) ~ error("A") + trend("A") + season("A")),
'Seasonal Naïve Method' = SNAIVE(Arrivals),
'STL decomposition to Log' = decomposition_model(STL(log(Arrivals)), ETS(season_adjust))) %>%
forecast(h = 3) %>%
accuracy(bigZ)
## # A tibble: 4 × 11
## .model Origin .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive to Log NZ Test -375. 14347. 11021. 0.200 5.83 0.741 0.746 0.309
## 2 ETS model NZ Test 4627. 15327. 11799. 2.23 6.45 0.793 0.797 0.283
## 3 Seasonal Naïve… NZ Test 8244. 18768. 14422. 3.83 7.76 0.970 0.976 0.566
## 4 STL decomposit… NZ Test 4252. 15618. 11873. 2.04 6.25 0.798 0.812 0.244
Using cross validation, the best method is Additive to Log. The previous best model, ETS model, is the second best using cross validation.
hardstuff <- aus_production %>%
select(Quarter, Cement) %>%
as_tsibble(index = Quarter)
autoplot(hardstuff)
## Plot variable not specified, automatically selected `.vars = Cement`
hardvally <- hardstuff %>%
slice(1:(n()-4)) %>%
stretch_tsibble(.init = 5, .step = 1)
cementmod <- hardvally %>%
model(ETS(Cement),
SNAIVE(Cement)) %>%
forecast(h = "1 year")
vallygirl <- hardstuff %>%
stretch_tsibble(.init = 5, .step = 4)
cementmod2 <- vallygirl %>%
model(ETS(Cement),
SNAIVE(Cement)) %>%
forecast(h= "1 year")
cementacc <- accuracy(cementmod2, hardstuff) %>%
mutate(MSE = (RMSE)^2) %>%
select(MSE, RMSE)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 3 observations are missing between 2010 Q3 and 2011 Q1
cementacc
## # A tibble: 2 × 2
## MSE RMSE
## <dbl> <dbl>
## 1 9962. 99.8
## 2 18028. 134.
The ETS seems to be the most accurate model. I expected the ETS model to be best because it is designed for longer data sets and has no trouble estimating the parameters and trend if necessary.