Load packages

library(fpp3)
library(distributional)
library(RColorBrewer)
library(seasonal)
library(mlbench)
library(fmsb)
library(stringr) 

Instructions

Do exercises 8.1, 8.5, 8.6, 8.7, 8.8, 8.9 in Hyndman. Please submit both the link to your RPubs page and the .pdf file with your rendered code.

Exercise 8.1

  Consider the number of pigs slaughtered in Victoria, available in the aus_livestock dataset.
  a. Use the ETS() function to estimate the equivalent model for simple exponential smoothing. Find the optimal values of alpha and l-naught, and generate forecasts for the next four months.

The optimal values of alpha and l-naught for an SES model are selected automatically by ETS() through maximum likelihood estimation. Calling tidy() on the fitted mable returns the estimated parameters for every state at once. The data frames df_alpha and df_l below summarise the optimal values of alpha and l-naught respectively.

aus_livestock_pigs <- aus_livestock %>% filter(Animal == 'Pigs')
aus_states <- aus_livestock_pigs %>% distinct(State) %>% pull(State) %>% as.character()

pigs_model <- aus_livestock_pigs %>% 
  model(ANN = ETS(Count ~ error("A")+trend("N")+season("N")))

# tidy() on a mable returns one row per estimated parameter for every key,
# so no loop over the states (or call to report()) is needed
df_alpha <- pigs_model %>% tidy() %>% filter(term == "alpha")
df_l <- pigs_model %>% tidy() %>% filter(term == "l[0]")

df_alpha
## # A tibble: 8 × 5
##   Animal State                        .model term  estimate
##   <fct>  <fct>                        <chr>  <chr>    <dbl>
## 1 Pigs   Australian Capital Territory ANN    alpha    0.498
## 2 Pigs   New South Wales              ANN    alpha    0.266
## 3 Pigs   Northern Territory           ANN    alpha    0.274
## 4 Pigs   Queensland                   ANN    alpha    0.236
## 5 Pigs   South Australia              ANN    alpha    0.281
## 6 Pigs   Tasmania                     ANN    alpha    0.182
## 7 Pigs   Victoria                     ANN    alpha    0.322
## 8 Pigs   Western Australia            ANN    alpha    0.215
df_l
## # A tibble: 8 × 5
##   Animal State                        .model term  estimate
##   <fct>  <fct>                        <chr>  <chr>    <dbl>
## 1 Pigs   Australian Capital Territory ANN    l[0]     1659.
## 2 Pigs   New South Wales              ANN    l[0]   107861.
## 3 Pigs   Northern Territory           ANN    l[0]      317.
## 4 Pigs   Queensland                   ANN    l[0]    79573.
## 5 Pigs   South Australia              ANN    l[0]    43109.
## 6 Pigs   Tasmania                     ANN    l[0]    12365.
## 7 Pigs   Victoria                     ANN    l[0]   100647.
## 8 Pigs   Western Australia            ANN    l[0]    43478.
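As a cross-check on the maximum likelihood output, the same parameters for a single series can be recovered by minimising the in-sample sum of squared one-step errors directly, since for additive Gaussian errors the two criteria coincide. A sketch for Victoria (the ses_sse helper, vic_pigs, and the optim() starting values are my own):

# SES recursion: e_t = y_t - l_{t-1}; l_t = l_{t-1} + alpha * e_t
ses_sse <- function(par, y) {
  alpha <- par[1]
  level <- par[2]
  sse <- 0
  for (t in seq_along(y)) {
    e <- y[t] - level          # one-step forecast error
    sse <- sse + e^2
    level <- level + alpha * e # update the level
  }
  sse
}

vic_pigs <- aus_livestock_pigs %>% filter(State == "Victoria")
optim(c(0.3, vic_pigs$Count[1]), ses_sse, y = vic_pigs$Count)$par
# should land near alpha = 0.322 and l[0] = 100647 from the tables above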
df_forecasts <- data.frame()
for (i in aus_states) {
  print(i)
  # compute the 4-month forecast once, then reuse it for the table and the plot
  fc <- pigs_model %>% filter(State == i) %>% forecast(h = 4)
  df_forecasts <- bind_rows(df_forecasts, fc %>% hilo())
  print_plot <- fc %>% autoplot() +
    autolayer(aus_livestock_pigs %>% filter(State == i)) +
    labs(title = paste("4 Month Forecast of Pig Slaughter for", i))
  print(print_plot)
}
## [1] "Australian Capital Territory"

## [1] "New South Wales"

## [1] "Northern Territory"

## [1] "Queensland"

## [1] "South Australia"

## [1] "Tasmania"

## [1] "Victoria"

## [1] "Western Australia"

df_forecasts
##    Animal                        State .model    Month
## 1    Pigs Australian Capital Territory    ANN 2019 Jan
## 2    Pigs Australian Capital Territory    ANN 2019 Feb
## 3    Pigs Australian Capital Territory    ANN 2019 Mar
## 4    Pigs Australian Capital Territory    ANN 2019 Apr
## 5    Pigs              New South Wales    ANN 2019 Jan
## 6    Pigs              New South Wales    ANN 2019 Feb
## 7    Pigs              New South Wales    ANN 2019 Mar
## 8    Pigs              New South Wales    ANN 2019 Apr
## 9    Pigs           Northern Territory    ANN 2019 Jan
## 10   Pigs           Northern Territory    ANN 2019 Feb
## 11   Pigs           Northern Territory    ANN 2019 Mar
## 12   Pigs           Northern Territory    ANN 2019 Apr
## 13   Pigs                   Queensland    ANN 2019 Jan
## 14   Pigs                   Queensland    ANN 2019 Feb
## 15   Pigs                   Queensland    ANN 2019 Mar
## 16   Pigs                   Queensland    ANN 2019 Apr
## 17   Pigs              South Australia    ANN 2019 Jan
## 18   Pigs              South Australia    ANN 2019 Feb
## 19   Pigs              South Australia    ANN 2019 Mar
## 20   Pigs              South Australia    ANN 2019 Apr
## 21   Pigs                     Tasmania    ANN 2019 Jan
## 22   Pigs                     Tasmania    ANN 2019 Feb
## 23   Pigs                     Tasmania    ANN 2019 Mar
## 24   Pigs                     Tasmania    ANN 2019 Apr
## 25   Pigs                     Victoria    ANN 2019 Jan
## 26   Pigs                     Victoria    ANN 2019 Feb
## 27   Pigs                     Victoria    ANN 2019 Mar
## 28   Pigs                     Victoria    ANN 2019 Apr
## 29   Pigs            Western Australia    ANN 2019 Jan
## 30   Pigs            Western Australia    ANN 2019 Feb
## 31   Pigs            Western Australia    ANN 2019 Mar
## 32   Pigs            Western Australia    ANN 2019 Apr
##                        Count        .mean                          80%
## 1  N(6.620284e-78, 379348.4) 6.620284e-78 [  -789.3238,    789.3238]80
## 2  N(6.620284e-78, 473609.1) 6.620284e-78 [  -881.9543,    881.9543]80
## 3  N(6.620284e-78, 567869.8) 6.620284e-78 [  -965.7405,    965.7405]80
## 4  N(6.620284e-78, 662130.5) 6.620284e-78 [ -1042.8164,   1042.8164]80
## 5     N(73470.44, 129400339) 7.347044e+04 [ 58892.2429,  88048.6355]80
## 6     N(73470.44, 138524684) 7.347044e+04 [ 58387.0246,  88553.8538]80
## 7     N(73470.44, 147649028) 7.347044e+04 [ 57898.1887,  89042.6897]80
## 8     N(73470.44, 156773372) 7.347044e+04 [ 57424.2379,  89516.6405]80
## 9  N(1.376158e-17, 17986.84) 1.376158e-17 [  -171.8753,    171.8753]80
## 10 N(1.376158e-17, 19341.94) 1.376158e-17 [  -178.2322,    178.2322]80
## 11 N(1.376158e-17, 20697.05) 1.376158e-17 [  -184.3700,    184.3700]80
## 12 N(1.376158e-17, 22052.15) 1.376158e-17 [  -190.3100,    190.3100]80
## 13     N(97840.26, 49879356) 9.784026e+04 [ 88789.2620, 106891.2594]80
## 14     N(97840.26, 52661704) 9.784026e+04 [ 88540.2480, 107140.2733]80
## 15     N(97840.26, 55444053) 9.784026e+04 [ 88297.7299, 107382.7914]80
## 16     N(97840.26, 58226401) 9.784026e+04 [ 88061.2243, 107619.2970]80
## 17     N(109006.2, 44439013) 1.090062e+05 [100463.0052, 117549.3153]80
## 18     N(109006.2, 47955188) 1.090062e+05 [100131.4561, 117880.8644]80
## 19     N(109006.2, 51471362) 1.090062e+05 [ 99811.8550, 118200.4655]80
## 20     N(109006.2, 54987537) 1.090062e+05 [ 99502.9964, 118509.3241]80
## 21     N(2829.825, 915519.9) 2.829825e+03 [  1603.6009,   4056.0499]80
## 22       N(2829.825, 945884) 2.829825e+03 [  1583.4324,   4076.2185]80
## 23     N(2829.825, 976248.1) 2.829825e+03 [  1563.5850,   4096.0659]80
## 24      N(2829.825, 1006612) 2.829825e+03 [  1544.0439,   4115.6069]80
## 25     N(95186.56, 87480760) 9.518656e+04 [ 83200.0583, 107173.0566]80
## 26     N(95186.56, 96558142) 9.518656e+04 [ 82593.5188, 107779.5961]80
## 27    N(95186.56, 105635524) 9.518656e+04 [ 82014.8802, 108358.2347]80
## 28    N(95186.56, 114712906) 9.518656e+04 [ 81460.6133, 108912.5016]80
## 29     N(68591.56, 25777782) 6.859156e+04 [ 62084.8884,  75098.2308]80
## 30     N(68591.56, 26964632) 6.859156e+04 [ 61936.7852,  75246.3339]80
## 31     N(68591.56, 28151482) 6.859156e+04 [ 61791.9071,  75391.2120]80
## 32     N(68591.56, 29338332) 6.859156e+04 [ 61650.0522,  75533.0670]80
##                            95%
## 1  [-1207.1666,   1207.1666]95
## 2  [-1348.8328,   1348.8328]95
## 3  [-1476.9727,   1476.9727]95
## 4  [-1594.8501,   1594.8501]95
## 5  [51175.0120,  95765.8664]95
## 6  [50402.3472,  96538.5312]95
## 7  [49654.7372,  97286.1412]95
## 8  [48929.8920,  98010.9863]95
## 9  [ -262.8606,    262.8606]95
## 10 [ -272.5826,    272.5826]95
## 11 [ -281.9696,    281.9696]95
## 12 [ -291.0540,    291.0540]95
## 13 [83997.9527, 111682.5687]95
## 14 [83617.1187, 112063.4027]95
## 15 [83246.2192, 112434.3021]95
## 16 [82884.5152, 112796.0061]95
## 17 [95940.5321, 122071.7884]95
## 18 [95433.4715, 122578.8489]95
## 19 [94944.6838, 123067.6366]95
## 20 [94472.3253, 123539.9952]95
## 21 [  954.4769,   4705.1740]95
## 22 [  923.6317,   4736.0192]95
## 23 [  893.2778,   4766.3731]95
## 24 [  863.3923,   4796.2585]95
## 25 [76854.7889, 113518.3260]95
## 26 [75927.1668, 114445.9480]95
## 27 [75042.2154, 115330.8995]95
## 28 [74194.5374, 116178.5775]95
## 29 [58640.4647,  78542.6544]95
## 30 [58413.9605,  78769.1587]95
## 31 [58192.3886,  78990.7306]95
## 32 [57975.4401,  79207.6791]95
  b. Compute a 95% prediction interval for the first forecast using ŷ ± 1.96s, where s is the standard deviation of the residuals. Compare your interval with the interval produced by R.

I can compute the 95% prediction interval using the variance stored in the distribution column, Count, where it appears as the second term of each N(mean, variance) entry. Extracting it proved awkward, so I used a regex to capture any number between a comma followed by whitespace and a closing parenthesis. This manual calculation gives approximately the same prediction interval as the one produced by R, with some precision lost because the values pass through a string representation and because 1.96 is a rounded z-score.

options(scipen = 10)
df_forecasts %>% filter(month(Month) == 1) %>% 
  mutate(string_dist = as.character(Count)) %>% 
  mutate(
    var = as.numeric(str_extract(string_dist, "(?<=,\\s)\\d+(?=\\))"))) %>% 
  mutate (calculated_PI_lower = .mean - 1.96*sqrt(var)) %>%
  mutate (calculated_PI_upper = .mean + 1.96*sqrt(var))
##   Animal                        State .model    Month                     Count
## 1   Pigs Australian Capital Territory    ANN 2019 Jan N(6.620284e-78, 379348.4)
## 2   Pigs              New South Wales    ANN 2019 Jan    N(73470.44, 129400339)
## 3   Pigs           Northern Territory    ANN 2019 Jan N(1.376158e-17, 17986.84)
## 4   Pigs                   Queensland    ANN 2019 Jan     N(97840.26, 49879356)
## 5   Pigs              South Australia    ANN 2019 Jan     N(109006.2, 44439013)
## 6   Pigs                     Tasmania    ANN 2019 Jan     N(2829.825, 915519.9)
## 7   Pigs                     Victoria    ANN 2019 Jan     N(95186.56, 87480760)
## 8   Pigs            Western Australia    ANN 2019 Jan     N(68591.56, 25777782)
##          .mean                          80%                         95%
## 1 6.620284e-78 [  -789.3238,    789.3238]80 [-1207.1666,   1207.1666]95
## 2 7.347044e+04 [ 58892.2429,  88048.6355]80 [51175.0120,  95765.8664]95
## 3 1.376158e-17 [  -171.8753,    171.8753]80 [ -262.8606,    262.8606]95
## 4 9.784026e+04 [ 88789.2620, 106891.2594]80 [83997.9527, 111682.5687]95
## 5 1.090062e+05 [100463.0052, 117549.3153]80 [95940.5321, 122071.7884]95
## 6 2.829825e+03 [  1603.6009,   4056.0499]80 [  954.4769,   4705.1740]95
## 7 9.518656e+04 [ 83200.0583, 107173.0566]80 [76854.7889, 113518.3260]95
## 8 6.859156e+04 [ 62084.8884,  75098.2308]80 [58640.4647,  78542.6544]95
##           string_dist       var calculated_PI_lower calculated_PI_upper
## 1  N(6.6e-78, 379348)    379348          -1207.1882           1207.1882
## 2 N(73470, 129400339) 129400339          51174.6023          95766.2761
## 3   N(1.4e-17, 17987)     17987           -262.8666            262.8666
## 4  N(97840, 49879356)  49879356          83997.6983         111682.8231
## 5 N(109006, 44439013)  44439013          95940.2920         122072.0285
## 6     N(2830, 915520)    915520            954.4424           4705.2085
## 7  N(95187, 87480760)  87480760          76854.4521         113518.6628
## 8  N(68592, 25777782)  25777782          58640.2819          78542.8373
options(scipen = 0)
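For reference, the string parsing can be avoided entirely: the distributional package (loaded above) provides a variance() accessor that works directly on distribution vectors. A sketch, assuming the Count column survives as a <dist> vector after bind_rows():

# variance() reads the variance straight off each forecast distribution
df_forecasts %>%
  filter(month(Month) == 1) %>%
  mutate(
    var = distributional::variance(Count),
    calculated_PI_lower = .mean - 1.96 * sqrt(var),
    calculated_PI_upper = .mean + 1.96 * sqrt(var)
  )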

Exercise 8.5

  Data set global_economy contains the annual Exports from many countries. Select one country to analyse.
  a. Plot the Exports series and discuss the main features of the data.

Using Indonesia as the country to analyse, I plotted exports over time: exports as a percentage of GDP rise until the late 1990s, reach one major spike, and then decline steeply. Since Exports is expressed as a percentage of GDP, I also transformed the series to recover exports in $USD, then used CPI to adjust for inflation and plotted the result. On the adjusted series, the biggest spike in exports was actually in 1980, followed by a decline and another peak in the 1990s; there is also a slight peak in the early 2010s.

global_economy_indonesia <- global_economy %>% 
  filter(Country == "Indonesia") %>% 
  mutate(exports_USD = Exports * GDP/100) %>% 
  mutate(adjusted_exports_USD = exports_USD / CPI * 100) %>% 
  drop_na()

global_economy_indonesia %>% autoplot(Exports)

global_economy_indonesia %>% 
  autoplot(adjusted_exports_USD) +
  labs(x = "Time", y = "$USD", 
       title = "Inflation Adjusted Exports for Indonesia")

  b. Use an ETS(A,N,N) model to forecast the series, and plot the forecasts.

Forecasting the series with an ETS(A,N,N) model, I can see that the resulting forecast resembles a naive forecast, owing to its high smoothing parameter alpha (0.9999).

ge_indonesia_model <- global_economy_indonesia %>% 
  model(ANN = ETS(adjusted_exports_USD ~ error("A")+trend("N")+season("N")))
global_economy_indonesia_temp <- 
  global_economy_indonesia %>% 
  select(adjusted_exports_USD)

fc <- ge_indonesia_model %>% forecast(h = 10) 

fc %>% autoplot() + autolayer(global_economy_indonesia_temp) +
  labs(x = "Time", y = "$USD", 
       title = "Inflation Adjusted Exports for Indonesia")

ge_indonesia_model %>% report()
## Series: adjusted_exports_USD 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##          l[0]
##  164269758014
## 
##   sigma^2:  1.591754e+21
## 
##      AIC     AICc      BIC 
## 2694.258 2694.769 2700.054
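Since alpha is essentially 1, the point forecasts should coincide with the naive method's; a quick visual check (a sketch; the model label naive is my own):

# with alpha near 1, the SES point forecast collapses to the last observed value
global_economy_indonesia %>%
  model(naive = NAIVE(adjusted_exports_USD)) %>%
  forecast(h = 10) %>%
  autoplot(global_economy_indonesia_temp) +
  labs(x = "Time", y = "$USD",
       title = "Naive Forecast for Comparison")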
  c. Compute the RMSE values for the training data.

I used the accuracy() function on the model to compute the RMSE on the training data. Since RMSE is scale-dependent, it is difficult to interpret on its own. As noted above, the forecast is close to the naive forecast, and this shows in the RMSSE, which is just barely under 1: scaled errors divide by the in-sample naive errors, so an RMSSE near 1 indicates the model performs about as well as the naive method.

accuracy(ge_indonesia_model)
## # A tibble: 1 × 11
##   Country   .model .type        ME    RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <fct>     <chr>  <chr>     <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Indonesia ANN    Traini… -3.68e8 3.91e10 2.79e10 -2.23  15.2  1.01 0.999 0.157
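That reading of the RMSSE can be sanity-checked by hand: for a non-seasonal series it is approximately the model's RMSE divided by the RMSE of the in-sample one-step naive errors. A sketch of the check (naive_rmse is my own helper name, and I assume fabletools' non-seasonal scaling):

# denominator of the scaled errors: in-sample one-step naive errors
naive_rmse <- global_economy_indonesia %>%
  model(naive = NAIVE(adjusted_exports_USD)) %>%
  accuracy() %>%
  pull(RMSE)
accuracy(ge_indonesia_model)$RMSE / naive_rmse  # should be close to the reported RMSSE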
  d. Compare the results to those from an ETS(A,A,N) model. (Remember that the trended model is using one more parameter than the simpler model.) Discuss the merits of the two forecasting methods for this data set.

For this data set, the trended model also captures the general downward trend in exports. While accounting for that trend may be beneficial, the most recent observations exhibit a local upward trend, which the trended model's forecast does not reflect; this may be a disadvantage. Judging by the AICc, the untrended model is also the better fit to the data, as it has the lower information criterion.

ge_indonesia_model2 <- global_economy_indonesia %>% 
  model(AAN = ETS(adjusted_exports_USD ~ error("A")+trend("A")+season("N")))

fc2 <- ge_indonesia_model2 %>% forecast(h = 10) 
fc2 %>% autoplot() + autolayer(global_economy_indonesia_temp) +
  labs(x = "Time", y = "$USD", 
       title = "Inflation Adjusted Exports for Indonesia")

accuracy(ge_indonesia_model2)
## # A tibble: 1 × 11
##   Country   .model .type       ME    RMSE     MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <fct>     <chr>  <chr>    <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 Indonesia AAN    Train… -6.58e9 4.14e10 2.97e10 -4.90  16.6  1.07  1.06 0.0544
ge_indonesia_model %>% report()
## Series: adjusted_exports_USD 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##          l[0]
##  164269758014
## 
##   sigma^2:  1.591754e+21
## 
##      AIC     AICc      BIC 
## 2694.258 2694.769 2700.054
ge_indonesia_model2 %>% report()
## Series: adjusted_exports_USD 
## Model: ETS(A,A,N) 
##   Smoothing parameters:
##     alpha = 0.9998997 
##     beta  = 0.07124248 
## 
##   Initial states:
##         l[0]        b[0]
##  45614182461 22327694439
## 
##   sigma^2:  1.861234e+21
## 
##      AIC     AICc      BIC 
## 2704.109 2705.443 2713.769
  e. Compare the forecasts from both methods. Which do you think is best?

To assess the forecasting accuracy of each method, I split the observations into a training set and a test set, and evaluated the forecasts against the test data with the accuracy() function. The SES model, without trend, produced the better forecast, as its RMSE was lower.

global_economy_indonesia_train <- 
  global_economy_indonesia %>% filter(Year < 2008)

mod1 <- global_economy_indonesia_train %>% 
  model(ANN = ETS(adjusted_exports_USD ~ error("A")+trend("N")+season("N")))

SES_fc1 <- mod1 %>% forecast(h=10) 
SES_fc1 %>% autoplot() + 
  autolayer(global_economy_indonesia_temp) +
  labs(x = "Time", y = "$USD", 
       title = "Inflation Adjusted Exports for Indonesia")

mod2 <- global_economy_indonesia_train %>% 
  model(AAN = ETS(adjusted_exports_USD ~ error("A")+trend("A")+season("N")))

T_fc2 <- mod2 %>% forecast(h=10)
T_fc2 %>% autoplot() + autolayer(global_economy_indonesia_temp) +
  labs(x = "Time", y = "$USD", 
       title = "Inflation Adjusted Exports for Indonesia")

global_economy_indonesia_test <-
  global_economy_indonesia %>% filter(Year >= 2008)

SES_fc1 %>% accuracy(global_economy_indonesia_test)
## # A tibble: 1 × 11
##   .model Country   .type        ME    RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <fct>     <chr>     <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ANN    Indonesia Test    1.47e10 3.32e10 2.77e10  5.87  15.5   NaN   NaN 0.581
T_fc2 %>% accuracy(global_economy_indonesia_test)
## # A tibble: 1 × 11
##   .model Country   .type        ME    RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <fct>     <chr>     <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAN    Indonesia Test    2.19e10 3.60e10 2.83e10  10.5  15.2   NaN   NaN 0.558
  f. Calculate a 95% prediction interval for the first forecast for each model, using the RMSE values and assuming normal errors. Compare your intervals with those produced using R.

Using the accuracy() function on each model, I can find the RMSE. Assuming the distribution of future observations is normal, a 95% prediction interval for the one-step-ahead forecast is the point forecast +/- 1.96 * RMSE, with the RMSE serving as an estimate of the standard deviation. The intervals calculated this way differ slightly from the ones produced by R, but agree to within about 5%.

PI_fc <- ge_indonesia_model %>% forecast(h=1) 
PI_fc2 <- ge_indonesia_model2 %>% forecast(h=1) 

# RMSE
RMSE1 <- ge_indonesia_model %>% accuracy()
# PI_fc[5] is the .mean column (point forecast); RMSE1[5] is the RMSE column
lower_PI1 <- PI_fc[5] - 1.96 * RMSE1[5] 
upper_PI1 <- PI_fc[5] + 1.96 * RMSE1[5] 
lower_PI1
##         .mean
## 1 68848073967
upper_PI1
##          .mean
## 1 222146273620
PI_fc %>% hilo() 
## # A tsibble: 1 x 7 [1Y]
## # Key:       Country, .model [1]
##   Country   .model  Year
##   <fct>     <chr>  <dbl>
## 1 Indonesia ANN     2018
## # ℹ 4 more variables: adjusted_exports_USD <dist>, .mean <dbl>, `80%` <hilo>,
## #   `95%` <hilo>
RMSE2 <- ge_indonesia_model2 %>% accuracy()
lower_PI2 <- PI_fc2[5] - 1.96 * RMSE2[5]  # same column positions as above
upper_PI2 <- PI_fc2[5] + 1.96 * RMSE2[5] 
lower_PI2
##         .mean
## 1 62730723393
upper_PI2
##        .mean
## 1 2.2508e+11
PI_fc2%>% hilo()
## # A tsibble: 1 x 7 [1Y]
## # Key:       Country, .model [1]
##   Country   .model  Year
##   <fct>     <chr>  <dbl>
## 1 Indonesia AAN     2018
## # ℹ 4 more variables: adjusted_exports_USD <dist>, .mean <dbl>, `80%` <hilo>,
## #   `95%` <hilo>
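The hilo columns are truncated in the printouts above; to see R's intervals directly, the 95% column can be pulled out (a small sketch):

# extract just the 95% interval from each one-step forecast
PI_fc %>% hilo(level = 95) %>% pull(`95%`)
PI_fc2 %>% hilo(level = 95) %>% pull(`95%`)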

Exercise 8.6

  Forecast the Chinese GDP from the global_economy data set using an ETS model. Experiment with the various options in the ETS() function to see how much the forecasts change with damped trend, or with a Box-Cox transformation. Try to develop an intuition of what each is doing to the forecasts.

I created three different forecasts with (1) a simple exponential smoothing model, (2) a trended model, and (3) a damped-trend model. On this dataset, the simple exponential smoothing model behaves much like the naive method, likely because the steep climb in China’s GDP over the last 20 years places nearly all of the weight on the most recent, high values. The trended model’s forecast closely extends the most recent slope, probably because the incline has been steep and consistent. The damped-trend model starts with a very similar slope but clearly tapers off towards the end of the forecast horizon.

global_economy_china <- global_economy %>% filter(Country == "China")

ge_china_models <- global_economy_china %>% 
  model(SES = ETS(GDP ~ error("A")+trend("N")+season("N")),
        Trend = ETS(GDP ~ error("A")+trend("A")+season("N")),
        dampedTrend = ETS(GDP ~ error("A")+trend("Ad")+season("N"))
        ) 

ge_china_models %>% forecast(h = 30) %>% 
  autoplot(level = NULL) + autolayer(global_economy_china)

After transforming the data with a log function (equivalent to a Box-Cox transformation with lambda = 0, and the ideal lambda from the Guerrero feature is close to 0), I created the same three forecasts again. Here the damping is easier to see, as the damped forecasts diverge more quickly from the trended model. The SES model again resembles the naive forecast: its high alpha of 0.9999 puts essentially all of the weight on the latest observation, and with no trend component the forecast is almost identical to the naive one.

global_economy_china %>% features(GDP, features = guerrero)
## # A tibble: 1 × 2
##   Country lambda_guerrero
##   <fct>             <dbl>
## 1 China           -0.0345
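With lambda this close to zero, the log is a reasonable stand-in for the Box-Cox transform, since box_cox(x, lambda = 0) is defined as log(x); a one-line sanity check (sketch):

# Box-Cox with lambda = 0 reduces to the natural log
all.equal(box_cox(global_economy_china$GDP, lambda = 0), log(global_economy_china$GDP))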
global_economy_china_log <- global_economy_china %>% 
  mutate(logGDP = log(GDP))

ge_china_models_log <- global_economy_china_log %>% 
  model(SES = ETS(logGDP ~ error("A")+trend("N")+season("N")),
        Trend = ETS(logGDP ~ error("A")+trend("A")+season("N")),
        dampedTrend = ETS(logGDP ~ error("A")+trend("Ad")+season("N"))
        ) 
global_economy_china_log <- global_economy_china_log %>% select(logGDP)
ge_china_models_log %>% 
  forecast(h = 30) %>% 
  autoplot(level = NULL) + 
  autolayer(global_economy_china_log)

global_economy_china_log %>% 
  model(SES = ETS(logGDP ~ error("A")+trend("N")+season("N"))) %>%
  report()
## Series: logGDP 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##      l[0]
##  24.77468
## 
##   sigma^2:  0.0177
## 
##       AIC      AICc       BIC 
##  5.470638  5.915082 11.651967
global_economy_china_log %>% 
  model(Trend = ETS(logGDP ~ error("A")+trend("A")+season("N"))) %>%
  report()
## Series: logGDP 
## Model: ETS(A,A,N) 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.1079782 
## 
##   Initial states:
##      l[0]       b[0]
##  24.77007 0.04320226
## 
##   sigma^2:  0.0088
## 
##       AIC      AICc       BIC 
## -33.07985 -31.92600 -22.77763
global_economy_china_log %>% 
  model(dampedTrend = ETS(logGDP ~ error("A")+trend("Ad")+season("N"))) %>%
  report()
## Series: logGDP 
## Model: ETS(A,Ad,N) 
##   Smoothing parameters:
##     alpha = 0.9998999 
##     beta  = 0.2480956 
##     phi   = 0.979999 
## 
##   Initial states:
##      l[0]        b[0]
##  24.82881 0.003249391
## 
##   sigma^2:  0.0091
## 
##       AIC      AICc       BIC 
## -30.46070 -28.81364 -18.09804

Exercise 8.7

  Find an ETS model for the Gas data from aus_production and forecast the next few years. Why is multiplicative seasonality necessary here? Experiment with making the trend damped. Does it improve the forecasts?

Multiplicative seasonality is necessary here because the size of the seasonal fluctuations changes with the level of the series: the seasonal swings have a much smaller amplitude in the 1950s and grow steadily from around 1970. Making the trend damped does not appear to have much effect, as the estimated damping parameter is fairly high at 0.98 (a damping parameter of 1 would imply no damping at all). The model with the undamped trend is the better fit to the data, with a lower AICc.

aus_production_gas <- aus_production %>% select(Gas)
gas_fit <- aus_production_gas %>% model(ETS(Gas))
gas_fc <- gas_fit %>% forecast(h = 40) 
gas_fc %>% autoplot() + autolayer(aus_production_gas)

gas_fit %>% report()
## Series: Gas 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.6528545 
##     beta  = 0.1441675 
##     gamma = 0.09784922 
## 
##   Initial states:
##      l[0]       b[0]      s[0]    s[-1]    s[-2]     s[-3]
##  5.945592 0.07062881 0.9309236 1.177883 1.074851 0.8163427
## 
##   sigma^2:  0.0032
## 
##      AIC     AICc      BIC 
## 1680.929 1681.794 1711.389
gas_fit_damped <- aus_production_gas %>% 
  model(ETS(Gas ~ error("M")+trend("Ad")+season("M")))
gas_fc_damped <- gas_fit_damped %>% forecast(h = 40)
gas_fc_damped %>% autoplot() + autolayer(aus_production_gas)

gas_fit_damped %>% report()
## Series: Gas 
## Model: ETS(M,Ad,M) 
##   Smoothing parameters:
##     alpha = 0.6489044 
##     beta  = 0.1551275 
##     gamma = 0.09369372 
##     phi   = 0.98 
## 
##   Initial states:
##      l[0]       b[0]      s[0]    s[-1]   s[-2]     s[-3]
##  5.858941 0.09944006 0.9281912 1.177903 1.07678 0.8171255
## 
##   sigma^2:  0.0033
## 
##      AIC     AICc      BIC 
## 1684.028 1685.091 1717.873

Exercise 8.8

  Recall your retail time series data (from Exercise 7 in Section 2.10).
  a. Why is multiplicative seasonality necessary for this series?

Multiplicative seasonality is necessary for this series for the same reason it was necessary in Exercise 8.7: the amplitude of the seasonality changes over time.

set.seed(32)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
autoplot(myseries, Turnover) 

  b. Apply Holt-Winters’ multiplicative method to the data. Experiment with making the trend damped.

Using a damped trend, the forecasts begin to exhibit a flattening trend almost immediately.

ms_fit <- myseries %>% model(ETS(Turnover)) 
ms_fc <- ms_fit %>% forecast(h=120) 
ms_fc %>% autoplot(level = NULL) + autolayer(myseries)

ms_fit_damped <- myseries %>% 
  model(ETS(Turnover ~ error("M")+trend("Ad")+season("M")))
ms_fc_damped <- ms_fit_damped %>% forecast(h=120) 
ms_fc_damped %>% autoplot(level = NULL) + autolayer(myseries)

  c. Compare the RMSE of the one-step forecasts from the two methods. Which do you prefer?

Taking a training set consisting of all but the last observation, I calculated the RMSE of the one-step forecast for each method. The RMSE is lower for the damped-trend method than for the undamped method. However, this reflects only a single point forecast (the NaN scaled errors and the accompanying warning likely arise because a one-observation test set gives accuracy() nothing to scale the errors against). Since the model fit was determined to be better for the undamped trend model via the AICc, I would still choose the undamped trend.

myseries_train <- myseries %>% filter(ym(Month) < "2018-12-01")
myseries_train_fit <- myseries_train %>% model(ETS(Turnover))
myseries_train_fc <- myseries_train_fit %>% forecast(h=1)
myseries_test <- myseries %>% filter(ym(Month) == "2018-12-01")
myseries_train_fc %>% accuracy(myseries_test)
## Warning: 1 error encountered
## [1] subscript out of bounds
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS(Turn… Sout… Pharmac… Test  -2.08  2.08  2.08 -1.49  1.49   NaN   NaN    NA
myseries_train2 <- myseries %>% filter(ym(Month) < "2018-12-01")
myseries_train_fit2 <- myseries_train2 %>% 
  model(ETS(Turnover ~ error("M")+trend("Ad")+season("M")))
myseries_train_fc2 <- myseries_train_fit2 %>% forecast(h=1)
myseries_test2 <- myseries %>% filter(ym(Month) == "2018-12-01")
myseries_train_fc2 %>% accuracy(myseries_test2)
## Warning: 1 error encountered
## [1] subscript out of bounds
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "ETS(Tur… Sout… Pharmac… Test  -1.52  1.52  1.52 -1.09  1.09   NaN   NaN    NA

Looking at the accuracy of the model for each method, I can see that the RMSE for the damped model is slightly higher than the RMSE for the undamped model as well.

accuracy(ms_fit)
## # A tibble: 1 × 12
##   State   Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>   <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 South … Pharmac… ETS(T… Trai… 0.128  2.93  2.03 0.101  4.04 0.402 0.410 0.0545
accuracy(ms_fit_damped)
## # A tibble: 1 × 12
##   State   Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>   <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 South … Pharmac… "ETS(… Trai… 0.165  2.94  2.03 0.290  4.03 0.403 0.413 0.0326
  d. Check that the residuals from the best method look like white noise.

The residuals appear homoscedastic and their distribution looks roughly normal. However, several spikes in the ACF plot fall well outside the significance bounds, implying leftover autocorrelation and a model that does not fully capture the structure in the data. Because of this, I also checked the residuals of the damped-trend model. The same issue appears: the residuals again look homoscedastic and roughly normal, but the ACF bounds are again exceeded.

ms_fit %>%  gg_tsresiduals()

ms_fit_damped %>%  gg_tsresiduals()
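A portmanteau test makes the same check numerically; a sketch using the Ljung-Box statistic (choosing lag = 24 for monthly data is my own convention):

# a small p-value is consistent with the ACF spikes seen above
ms_fit %>% augment() %>% features(.innov, ljung_box, lag = 24)
ms_fit_damped %>% augment() %>% features(.innov, ljung_box, lag = 24)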

  e. Now find the test set RMSE, while training the model to the end of 2010. Can you beat the seasonal naïve approach from Exercise 7 in Section 5.11?

Splitting the observations into a training set (pre-2011) and a test set (2011 onwards), I obtain an RMSE of 11.5. This beats the seasonal naive approach, which is also visible in the plots of the forecasts against the actual observations: the actual data fall outside both the 80% and 95% prediction intervals of the seasonal naive forecasts, while this is not the case for the Holt-Winters model.

myseries_train <- myseries %>% filter(year(Month) < 2011)
myseries_train_fit <- myseries_train %>% model(ETS(Turnover))
myseries_train_fc <- myseries_train_fit %>% forecast(h=96)
myseries_test <- myseries %>% filter(year(Month) >= 2011)
myseries_train_fc %>% accuracy(myseries_test)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS(Turn… Sout… Pharmac… Test   8.49  11.5  9.12  7.00  7.62   NaN   NaN 0.776
myseries_train_fc %>% autoplot() + autolayer(myseries)

myseries_train_naive_fit <- myseries_train %>% 
  model(seasonal_naive = SNAIVE(Turnover))
myseries_train_naive_fc <- myseries_train_naive_fit %>% forecast(h=96)
myseries_train_naive_fc %>% accuracy(myseries_test)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 seasonal… Sout… Pharmac… Test   18.5  22.7  18.8  15.1  15.5   NaN   NaN 0.912
myseries_train_naive_fc %>% autoplot() + autolayer(myseries)

Exercise 8.9

  For the same retail data, try an STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data. How does that compare with your best previous forecasts on the test set?

After performing an STL decomposition on the Box-Cox transformed series and then modelling the seasonally adjusted data with ETS, the resulting forecasts had an RMSE of 0.216. This is far smaller than the values in the previous exercise, but note that it is measured on the Box-Cox transformed scale, so it is not directly comparable with the earlier RMSE of 11.5. The STL decomposition of the Box-Cox transformed series allowed the seasonality to be removed from the data, and the forecasts achieve a much lower (transformed-scale) RMSE even when assessed against the actual Box-Cox transformed observations rather than just the seasonally adjusted ones.

#break up training data again
myseries_train <- myseries %>% 
  filter(year(Month) < 2011)

#find ideal lambda
myseries_train %>% features(Turnover, features = guerrero)
## # A tibble: 1 × 3
##   State           Industry                                       lambda_guerrero
##   <chr>           <chr>                                                    <dbl>
## 1 South Australia Pharmaceutical, cosmetic and toiletry goods r…          0.0611
#perform box_cox transform on the testing data, and rename
myseries_test <- myseries %>% filter(year(Month) >= 2011) %>% 
  mutate(box_cox = box_cox(Turnover,0.0611)) %>% 
  rename(season_adjust = box_cox) #rename to use as comparison for accuracy

#full series transform for graphing in autolayer
#could have done this step before breaking up the training/test
myseries_transformed_full <- myseries %>% 
  mutate(box_cox = box_cox(Turnover,0.0611)) %>%
  select(box_cox)
comp_full <- myseries_transformed_full %>% model(stl = STL(box_cox))
myseries_decomp_full <- components(comp_full)

#perform box_cox transform on training data
myseries_transformed <- myseries_train %>% 
  mutate(box_cox = box_cox(Turnover,0.0611))

#STL decomposition
comp <- myseries_transformed %>% model(stl = STL(box_cox))

#find components
myseries_decomp <- components(comp)

#fit model to seasonally adjusted data
SeA_myseries_fit <- myseries_decomp %>% 
  model(ETS(season_adjust)) %>% 
  rename(.model1 = .model)

#forecast 8 years ahead
SeA_myseries_fc <- SeA_myseries_fit %>% forecast(h = 96) 

#plot forecast with actual data
SeA_myseries_fc %>% autoplot() + 
  autolayer(myseries_transformed_full) +
  labs(x = "Time", y = "Box-Cox Transformed Turnover", 
       title = "8 Year Forecast based on 
Seasonally Adjusted Box-Cox Transformed Turnover")

#assess data against the actual observations
SeA_myseries_fc %>% accuracy(myseries_test)
## # A tibble: 1 × 12
##   .model   State Industry .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS(sea… Sout… Pharmac… Test  -0.154 0.216 0.171 -2.80  3.11   NaN   NaN 0.641
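For a comparison on the original scale, fable can bundle the same steps with decomposition_model(), which models the seasonal component as well (SNAIVE by default) and back-transforms the box_cox() in the model formula automatically, so the resulting accuracy would be directly comparable with Exercise 8.8. A sketch (stl_ets is my own model label):

# ETS on the seasonally adjusted component; seasonal component handled by SNAIVE
stl_ets_fit <- myseries_train %>%
  model(stl_ets = decomposition_model(
    STL(box_cox(Turnover, 0.0611)),
    ETS(season_adjust)
  ))
stl_ets_fit %>%
  forecast(h = 96) %>%
  accuracy(myseries %>% filter(year(Month) >= 2011))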