library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)
library(feasts)
#Question 1
#Question 1 Part a
aus_livestock
## # A tsibble: 29,364 x 4 [1M]
## # Key: Animal, State [54]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
## 7 1977 Jan Bulls, bullocks and steers Australian Capital Territory 1800
## 8 1977 Feb Bulls, bullocks and steers Australian Capital Territory 1900
## 9 1977 Mar Bulls, bullocks and steers Australian Capital Territory 2700
## 10 1977 Apr Bulls, bullocks and steers Australian Capital Territory 2300
## # … with 29,354 more rows
aus_pigs <- aus_livestock %>%
filter(Animal == "Pigs") %>%
filter(State == "Victoria") %>%
filter(Month >= yearmonth(as.Date("2010-01-01")))
aus_pigs_expo <- aus_pigs %>%
model(ETS(Count ~ error("A") + trend("N") + season("N")))
report(aus_pigs_expo)
## Series: Count
## Model: ETS(A,N,N)
## Smoothing parameters:
## alpha = 0.225021
##
## Initial states:
## l[0]
## 67178.49
##
## sigma^2: 49463947
##
## AIC AICc BIC
## 2423.061 2423.292 2431.107
#The optimal value of alpha is 0.225021 and the optimal value of lambda is 67178.49.
aus_pigs_expo %>%
forecast(h = 4) %>%
autoplot(aus_pigs) +
geom_line(aes(y = .fitted), col="#D55E00",
data = augment(aus_pigs_expo)) +
labs(title = "Forecast of Slaugtered Victorian Pigs using Simple Exponential Smoothing (h = 4)")
#Question 1 Part b
pigcast <- aus_pigs_expo %>%
forecast(h = 4)
pigcast %>%
hilo(95)
## # A tsibble: 4 x 7 [1M]
## # Key: Animal, State, .model [1]
## Animal State .model Month Count .mean `95%`
## <fct> <fct> <chr> <mth> <dist> <dbl> <hilo>
## 1 Pigs Victor… "ETS(… 2019 Jan N(95295, 4.9e+07) 95295. [81510.15, 109079.2]95
## 2 Pigs Victor… "ETS(… 2019 Feb N(95295, 5.2e+07) 95295. [81165.48, 109423.9]95
## 3 Pigs Victor… "ETS(… 2019 Mar N(95295, 5.4e+07) 95295. [80829.01, 109760.4]95
## 4 Pigs Victor… "ETS(… 2019 Apr N(95295, 5.7e+07) 95295. [80500.19, 110089.2]95
95295 - 1.96 *(sqrt(49463947)) #Manually took the point forecast out and took the sqrt of sigma squared, which was calculated in part a.
## [1] 81510.2
95295 + 1.96 *(sqrt(49463947))
## [1] 109079.8
#The prediction interval I manually calculated is [81510.2, 109079.8]. The prediction intervals R calculated are more precise since they are for each forecasted period, although they are still very close to my manually calculated values.
#Question 2
#Question 2 Part a
global_economy %>%
distinct(Country)
## # A tibble: 263 × 1
## Country
## <fct>
## 1 Afghanistan
## 2 Albania
## 3 Algeria
## 4 American Samoa
## 5 Andorra
## 6 Angola
## 7 Antigua and Barbuda
## 8 Arab World
## 9 Argentina
## 10 Armenia
## # … with 253 more rows
brazil_econ <- global_economy %>%
filter(Country == "Brazil")
brazil_train <- brazil_econ %>%
filter(Year <= year(as.Date("2015-01-01")))
brazil_econ %>%
autoplot(Exports) +
labs(title = "Brazil Exports")
#Brazil's exports fluctuates a lot in the short run although there is a steady positive growth trend.
#Question 2 Part b
brazil_econ_expo <- brazil_train %>%
model(ETS(Exports ~ error("A") + trend("N") + season("N")))
brazil_econ_expo %>%
forecast(h = 10)%>%
autoplot(brazil_train) +
labs(title = "Ten Year Forecast of Brazilian Exports") +
labs(subtitle = "Using ETS(A,N,N)")
#Question 2 Part c
accuracy(brazil_econ_expo)
## # A tibble: 1 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brazil "ETS(Exports… Trai… 0.121 1.58 1.18 -0.859 13.3 0.971 0.977 0.00195
#My RMSE is 1.58.
#Question 2 Part d
brazil_econ_linear <- brazil_train %>%
model(ETS(Exports ~ error("A") + trend("A") + season("N")))
accuracy(brazil_econ_linear)
## # A tibble: 1 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brazil "ETS(Export… Trai… 1.27e-4 1.58 1.19 -2.24 13.5 0.977 0.974 0.00810
#The ANN model is almost identical to the naive method since there is no trend or seasonality. There is a slight trend in the data, so while the RMSEs are the same the AAN method is best for forecasting this data between the two.
#Question 2 Part e
brazil_econ_linear %>%
forecast(h = 10)%>%
autoplot(brazil_train) +
labs(title = "Ten Year Forecast of Brazil Exports") +
labs(subtitle = "Using ETS(A,A,N)")
#After comparing the plots for the two methods, it definitely appears that the AAN is the best since it captures the slight trend present.
#Question 3
china_gdp <- global_economy %>%
filter(Country == "China")
autoplot(china_gdp) +
labs(title = "Chinese GDP")
## Plot variable not specified, automatically selected `.vars = GDP`
china_log <- china_gdp %>%
mutate(GDP = log(GDP))
autoplot(china_log) +
labs(title = "Logged Chinese GDP") #Logged GDP to smooth out the exponential-like growth China experienced
## Plot variable not specified, automatically selected `.vars = GDP`
china_ann <- china_gdp %>%
model(ETS(GDP ~ error("A") + trend("N") + season("N")))
china_ann %>%
forecast(h = 20)%>%
autoplot(china_gdp) +
labs(title = "Twenty Year Forecast of Chinese GDP (ETS = (A,N,N))")
#The very strong and clear upward trend is completely ignored by this model. Let's used the logged data and see if it looks better.
china_lann <- china_log %>%
model(ETS(GDP ~ error("A") + trend("N") + season("N")))
china_lann %>%
forecast(h = 20)%>%
autoplot(china_log) +
labs(title = "Twenty Year Forecast Logged of Chinese GDP") +
labs(subtitle = "Using ETS(A,N,N)")
#This forecast looks a lot better, although it is still missing out on most of the upward trend. I think the logged AAN method will yield the best results.
china_aan <- china_gdp %>%
model(ETS(GDP ~ error("A") + trend("A") + season("N")))
china_aan %>%
forecast(h = 20)%>%
autoplot(china_gdp) +
labs(title = "Twenty Year Forecast of Chinese GDP") +
labs(subtitle = "Using ETS(A,A,N)")
#This forecast looks a lot better than the previous ones. It definitely needs a damp since the growth will slow down in the future, but I will use AAN on the logged data first so I can compare.
china_laan <- china_log %>%
model(ETS(GDP ~ error("A") + trend("A") + season("N")))
china_laan %>%
forecast(h = 20)%>%
autoplot(china_log) +
labs(title = "Twenty Year Forecast of Chinese Logged GDP") +
labs(subtitle = "Using ETS(A,A,N)")
#This forecast looks even better than the last one since the exponential-like growth has been smoothed out. However, the growth will slow down in the future so a damp is definitely necessary.
china_gdp %>%
model(`Damped Holt's method` = ETS(GDP ~ error("A") +
trend("Ad", phi = 0.9) + season("N"))) %>%
forecast(h = 20) %>%
autoplot(china_gdp) +
labs(title = "Twenty Year Forecast of Chinese GDP") +
labs(subtitle = "Using ETS(A,A,N) with a Damping Component")
#This forecast controls for limited growth into the future but is being forecasted off an uncontrolled-for exponential-like growth. I expect the logged damped that I will run next to work best.
china_log %>%
model(`Damped Holt's method` = ETS(GDP ~ error("A") +
trend("Ad", phi = 0.9) + season("N"))) %>%
forecast(h = 20) %>%
autoplot(china_log) +
labs(title = "Twenty Year Forecast of Chinese Logged GDP") +
labs(subtitle = "Using ETS(A,A,N) with a Damping Component")
#As expected, this forecast looks a lot better than all previous ones. However, none of the methods I have tried fully capture the exponential growth, at least in the short-run. Box-Cox is not applicable here since there is no seasonality.
#Question 4
aus_gas <- aus_production %>%
select("Gas")
autoplot(aus_gas) +
labs(title = "Australian Gas Production")
## Plot variable not specified, automatically selected `.vars = Gas`
#Multiplicative seasonality is necessary since the variance of the seasonality is increasing overtime.
aus_hwplusdamp <- aus_gas %>%
model(
hw = ETS(Gas ~ error("M") + trend("A") + season("M")),
hwdamped = ETS(Gas ~ error("M") + trend("Ad", phi = 0.9) + season("M")))%>%
forecast(h = 36) %>%
autoplot(aus_gas) +
labs(title = "Thirty-Six Quarter Forecast of Australian Gas Production") +
labs(subtitle = "Using Holt-Winters Multiplicative Method")
aus_hwplusdamp
res_hw <- aus_gas %>%
model(
hw = ETS(Gas ~ error("M") + trend("A") + season("M")))
res_hwdamped <- aus_gas %>%
model(
hw = ETS(Gas ~ error("M") + trend("Ad", phi = 0.9) + season("M")))
residuals(res_hw)
## # A tsibble: 218 x 3 [1Q]
## # Key: .model [1]
## .model Quarter .resid
## <chr> <qtr> <dbl>
## 1 hw 1956 Q1 0.0181
## 2 hw 1956 Q2 -0.0958
## 3 hw 1956 Q3 0.0267
## 4 hw 1956 Q4 0.0901
## 5 hw 1957 Q1 -0.0387
## 6 hw 1957 Q2 0.0495
## 7 hw 1957 Q3 -0.0986
## 8 hw 1957 Q4 0.0358
## 9 hw 1958 Q1 -0.0350
## 10 hw 1958 Q2 0.0501
## # … with 208 more rows
residuals(res_hwdamped)
## # A tsibble: 218 x 3 [1Q]
## # Key: .model [1]
## .model Quarter .resid
## <chr> <qtr> <dbl>
## 1 hw 1956 Q1 0.0429
## 2 hw 1956 Q2 -0.0871
## 3 hw 1956 Q3 0.0237
## 4 hw 1956 Q4 0.0877
## 5 hw 1957 Q1 -0.0456
## 6 hw 1957 Q2 0.0483
## 7 hw 1957 Q3 -0.0925
## 8 hw 1957 Q4 0.0339
## 9 hw 1958 Q1 -0.0381
## 10 hw 1958 Q2 0.0577
## # … with 208 more rows
#The damped model appears to have reduced the size of most of the residuals which indicates a better fit for the data. Looking at the plot, it also appears to be the better forecast.
#Question 5
#Question 5 Part a
set.seed(105)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries %>% autoplot(Turnover)
#Multiplicative seasonality is needed since the variance in the seasonality is not constant and is increasing over time.
#Question 5 Part b
myseries_holtmult <- myseries %>%
model(
hw = ETS(Turnover ~ error("M") + trend("A") + season("M")))%>%
forecast(h = 36) %>%
autoplot(myseries) +
labs(title = "Three Year Forecast of Australian Retail Turnover") +
labs(subtitle = "Using Holt-Winters Multiplicative Method")
myseries_holtmult
myseries_damped <- myseries %>%
model(
hwdamped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M"))) %>%
forecast(h = 36) %>%
autoplot(myseries) +
labs(title = "Three Year Forecast of Australian Retail Turnover") +
labs(subtitle = "Using Damped Holt-Winters Multiplicative Method")
myseries_damped
#Question 5 Part c
myseries %>%
stretch_tsibble(.init = 10) %>%
model(
hw = ETS(Turnover ~ error("M") + trend("A") + season("M")),
hwdamped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M")))%>%
forecast(h = 1) %>%
accuracy(myseries)
## Warning: 8 errors (2 unique) encountered for hw
## [3] A seasonal ETS model cannot be used for this data.
## [5] Not enough data to estimate this ETS model.
## Warning: 8 errors (2 unique) encountered for hwdamped
## [3] A seasonal ETS model cannot be used for this data.
## [5] Not enough data to estimate this ETS model.
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 2019 Jan
## # A tibble: 2 × 12
## .model State Indus…¹ .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hw Queen… Food r… Test 1.18 23.3 17.6 0.126 2.05 0.295 0.320 0.220
## 2 hwdamped Queen… Food r… Test 4.10 23.3 17.4 0.471 2.00 0.292 0.320 0.0782
## # … with abbreviated variable name ¹Industry
#The RMSEs for the HW and the HW-damped are identical, which checks out since the plots looked very similar. I prefer the damped method since retail is declining in the face of online shopping. Turnover will likely decline with it and a good forecast should capture this.
#Question 5 Part d
dampedcheck <- myseries %>%
model(hw = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M")))
gg_tsresiduals(dampedcheck)
#The ACF plot shows significant autocorrelation so these residuals do not resemble white noise.
#Question 5 Part e
myseries %>%
filter(year(Month) < 2011) %>%
model(
snaive = SNAIVE(Turnover),
hw = ETS(Turnover ~ error("M") + trend("A") + season("M")),
hwdamped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M"))
) %>%
forecast(h = "7 years") %>%
accuracy(myseries)
## # A tibble: 3 × 12
## .model State Indus…¹ .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hw Queens… Food r… Test -102. 126. 105. -4.91 5.09 1.87 1.77 0.899
## 2 hwdamped Queens… Food r… Test 186. 226. 188. 8.81 8.96 3.35 3.17 0.898
## 3 snaive Queens… Food r… Test 292. 321. 292. 14.2 14.2 5.19 4.51 0.911
## # … with abbreviated variable name ¹Industry
#The HW and HW damped both beat the SNAIVE since their RMSEs are lower.
#Question 6
#Question 6 Part a
aus_trips <- tourism %>%
summarise(Trips = sum(Trips))
aus_trips %>%
autoplot(Trips)
#The data has a significant degree of seasonality. It appears to follow a very slight downward trend until around 2010 when it enters a strong positive growth trend.
#Question 6 Part b
aus_stl <- aus_trips %>%
model(STL(Trips)) %>%
components()
aus_stl %>%
as_tsibble() %>%
autoplot(season_adjust)
#Question 6 Part c
stldamped <- decomposition_model(
STL(Trips),
ETS(season_adjust ~ error("A") + trend("Ad") + season("N"))
)
aus_trips %>%
model(dcmp_AAdN = stldamped) %>%
forecast(h = "2 years") %>%
autoplot(aus_trips)
#Question 6 Part d
stlholt <- decomposition_model(
STL(Trips),
ETS(season_adjust ~ error("A") + trend("A") + season("N")))
aus_trips %>%
model(dcmp_AAN = stlholt) %>%
forecast(h = "2 years") %>%
autoplot(aus_trips)
#Question 6 Part e
aus_trips %>%
model(ets = ETS(Trips)) %>%
forecast(h = "2 years") %>%
autoplot(aus_trips)
#Question 6 Part f
fitcheck6 <- aus_trips %>%
model(
dcmp_AAdN = stldamped,
dcmp_AAN = stlholt,
ets = ETS(Trips)
)
accuracy(fitcheck6)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 dcmp_AAdN Training 103. 763. 576. 0.367 2.72 0.607 0.629 -0.0174
## 2 dcmp_AAN Training 99.7 763. 574. 0.359 2.71 0.604 0.628 -0.0182
## 3 ets Training 105. 794. 604. 0.379 2.86 0.636 0.653 -0.00151
#The RMSEs for the damped and HW are identical. The ETS has the worst fit.
#Question 6 Part g
fitcheck6 %>%
forecast(h = "2 years") %>%
autoplot(aus_trips, level = NULL)
#The three forecasts are very similar. I would not pick the ETS due to the relatively high RMSE. Between the two AANs, I would pick the regular undamped HW since the number of trips will hit a "maximum" at some point. Australian air traffice can only sustain so many travellers.
#Question 6 Part h
residual6 <- fitcheck6 %>%
select(dcmp_AAN)
augment(residual6) %>%
gg_tsdisplay(.resid, lag_max = 24, plot_type = "histogram")
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
augment(residual6) %>%
features(.innov, ljung_box, lag = 24, dof = 4)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 dcmp_AAN 33.3 0.0311
#The p-value of the LB test is <.05 so we can reject the null hypothesis and say that the residuals do not resemble white noise.
#Question 7
#Question 7 Part a
nz <- aus_arrivals %>%
filter(Origin == "NZ")
nz %>%
autoplot(Arrivals / 1e3) + labs(y = "Thousands of People")
#There is an obvious upward trend with a high degree of seasonality. It also appears that the variance of the seasonality is not constant.
#Question 7 Part b
nz_train <- nz %>%
slice(1:(n() - 8))
nz_train %>%
model(ETS(Arrivals ~ error("M") + trend("A") + season("M"))) %>%
forecast(h = "2 years") %>%
autoplot() +
autolayer(nz, Arrivals)
#Question 7 Part c
#Multiplicative seasonality is necessary since the variation in seasonality is not constant over time.
#Question 7 Part d
finalstep <- anti_join(nz, nz_train,
by = c("Quarter", "Origin", "Arrivals"))
nz_fc <- nz_train %>%
model(
ets = ETS(Arrivals),
log_ets = ETS(log(Arrivals)),
snaive = SNAIVE(Arrivals),
stl = decomposition_model(STL(log(Arrivals)), ETS(season_adjust)))
forecast7 <- nz_fc %>%
forecast(h = "2 years")
forecast7 %>%
autoplot(level = NULL) +
autolayer(finalstep, Arrivals)
#Question 7 Part e
forecast7 %>% accuracy(nz)
## # A tibble: 4 × 11
## .model Origin .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ets NZ Test -3495. 14913. 11421. -0.964 3.78 0.768 0.771 -0.0260
## 2 log_ets NZ Test 2467. 13342. 11904. 1.03 4.03 0.800 0.689 -0.0786
## 3 snaive NZ Test 9709. 18051. 17156. 3.44 5.80 1.15 0.933 -0.239
## 4 stl NZ Test -12535. 22723. 16172. -4.02 5.23 1.09 1.17 0.109
checkres7 <- nz_fc %>%
select('ets')
checkres7 %>%
gg_tsresiduals()
#The log ets appears to have the most accurate forecast based on the RMSE scores. Based on the ACF, it appears that the residuals are not autocorrelated. However, the plot of the residuals does not appear to resemble white noise.
#Question 7 Part f
nz_cval <- nz %>%
slice(1:(n() - 3)) %>%
stretch_tsibble(.init = 36, .step = 3)
nz_cval %>%
model(
ets = ETS(Arrivals),
log_ets = ETS(log(Arrivals)),
snaive = SNAIVE(Arrivals),
stl = decomposition_model(STL(log(Arrivals)), ETS(season_adjust))
) %>%
forecast(h = 3) %>%
accuracy(nz)
## # A tibble: 4 × 11
## .model Origin .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ets NZ Test 4627. 15327. 11799. 2.23 6.45 0.793 0.797 0.283
## 2 log_ets NZ Test 4388. 15047. 11566. 1.99 6.36 0.778 0.782 0.268
## 3 snaive NZ Test 8244. 18768. 14422. 3.83 7.76 0.970 0.976 0.566
## 4 stl NZ Test 4252. 15618. 11873. 2.04 6.25 0.798 0.812 0.244
#The log_ets has the lowest RMSE score and appears to be the best.
#Question 8
#Question 8 Part a
q8 <- aus_production %>%
select(Quarter, Cement) %>%
as_tsibble(index = Quarter)
autoplot(q8)
## Plot variable not specified, automatically selected `.vars = Cement`
#Question 8 Part b
thelightattheend <- q8 %>%
stretch_tsibble(.init = 5, .step = 4)
lastvar <- thelightattheend %>%
model(ETS(Cement),
SNAIVE(Cement)) %>%
forecast(h= "1 year")
reallast <- accuracy(lastvar, q8) %>%
mutate(MSE = (RMSE)^2) %>%
select(MSE, RMSE)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 3 observations are missing between 2010 Q3 and 2011 Q1
reallast
## # A tibble: 2 × 2
## MSE RMSE
## <dbl> <dbl>
## 1 9962. 99.8
## 2 18028. 134.
#It appears that the ETS is the most accurate. This dataset is very long so ETS should work better for it.