Hw #5

library(fpp3)
library(readxl)

Questions 1

1A

piggies <- aus_livestock %>%
  filter(Animal == "Pigs" & State == "Victoria") %>%
  filter(Month >= yearmonth(as.Date("2010-01-01")) & Month <= yearmonth(as.Date("2018-08-01")))


pigplot <- piggies %>%
  autoplot(Count) +
  labs(title = 'Pigs Slaughtered')
pigplot

fitpig <- piggies %>%
  model(ses = ETS(Count ~ error('A') + trend('N') + season('N')))

fitpig %>% report()

## Series: Count 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.2508843 
## 
##   Initial states:
##      l[0]
##  66303.48
## 
##   sigma^2:  48567138
## 
##      AIC     AICc      BIC 
## 2327.637 2327.877 2335.570

fourmonth <- fitpig %>%
  forecast(h = 4)


fourmonthplot <- fitpig %>%
  forecast(h = 4) %>%
  autoplot(piggies) +
  labs(title = 'Four Month Forecast Data')
fourmonthplot

alpha = 0.2508843 l0 = 66303.48

1B

firstfore <- fourmonth %>%
  pull(Count) %>%
  head(1)

standardDeviation <- augment(fitpig) %>%
  pull(.resid) %>%
  sd()

 
bottom <- firstfore - 1.96 * standardDeviation
top <- firstfore + 1.96 * standardDeviation
results <- c(bottom, top)
names(results) <- c('Lower', 'Upper')
results

## <distribution[2]>
##              Lower              Upper 
##  N(84133, 4.9e+07) N(110907, 4.9e+07)

hilo(fourmonth$Count, 95)

## <hilo[4]>
## [1] [83861.13, 111179.2]95 [83437.82, 111602.5]95 [83026.87, 112013.4]95
## [4] [82627.25, 112413.0]95

The 95% interval that I found is from 84,133 to 110,907

The 95% interval that R found were slightly wider (from 83,861.13 to 111,179.2)

Question 2

2A

bangEx <- global_economy %>%
  filter(Code == 'BGD')
head(bangEx)

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country    Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>      <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Bangladesh BGD    1960 4274893913. NA        NA    9.31   10.0    48199747
## 2 Bangladesh BGD    1961 4817580184.  6.06     NA   11.7    10.8    49592802
## 3 Bangladesh BGD    1962 5081413340.  5.45     NA   10.8    10.7    51030137
## 4 Bangladesh BGD    1963 5319458351. -0.456    NA   11.7     9.98   52532417
## 5 Bangladesh BGD    1964 5386054619. 11.0      NA   14.1    10.0    54129100
## 6 Bangladesh BGD    1965 5906636557.  1.61     NA   13.4     9.66   55834038

bangEx %>%
  autoplot(Exports)  +
  labs(title = 'Bangladesh Exports')

This data displays a downward trend from 1960 to around 1975, then the data displays a upward trend from around 1975 to around 2012. Then the data starts to trend downwards again.

2B

fitbang <- bangEx %>%
  model(ANN = ETS(Exports ~ error('A') + trend('N') + season('N')))

bangExfore <- fitbang %>%
  forecast(h = 10)

bangExfore %>% autoplot(bangEx) +
  labs(title = 'Bangladesh 10 year Exports Forecast')

2C

accuracy(fitbang)

## # A tibble: 1 × 11
##   Country    .model .type        ME  RMSE   MAE    MPE  MAPE  MASE RMSSE   ACF1
##   <fct>      <chr>  <chr>     <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 Bangladesh ANN    Training 0.0869  1.25 0.945 -0.891  12.0 0.983 0.991 0.0148

The RMSE for the training data is 1.253158.

2D

bangcomp <- bangEx %>%
  model(
    ANN = ETS(Exports ~ error('A') + trend('N') + season('N')),
    AAN = ETS(Exports ~ error('A') + trend('A') + season('N')))

accuracy(bangcomp)

## # A tibble: 2 × 11
##   Country    .model .type         ME  RMSE   MAE    MPE  MAPE  MASE RMSSE   ACF1
##   <fct>      <chr>  <chr>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 Bangladesh ANN    Training 0.0869   1.25 0.945 -0.891  12.0 0.983 0.991 0.0148
## 2 Bangladesh AAN    Training 0.00557  1.25 0.954 -1.90   12.2 0.993 0.989 0.0135

The AAN model has a slightly lower RMSE which means that it is a more accurate model for this data.

2E

bangcomp %>%
  forecast(h = 10) %>%
  autoplot(bangEx) +
  labs(title = 'Bangladesh Exports Model Comparison')

The forecast looks like the AAN model is better for forecasting this data. The ANN forecast shows the data leveling off which does not fit the overall trend of the data. The AAN model shows an upward trend in the data which seems like a much more accurate forecast with the trend of the data.

Question 3

chineseGDP <- global_economy %>%
  filter(Country == 'China')


chineseGDP %>% autoplot(GDP) +
  labs(title = 'Chinese GDP')

bigL <- chineseGDP %>%
  features(GDP, features = guerrero) %>%
  pull(lambda_guerrero)



chineseGDPComp <- chineseGDP %>%
  model(
    ETS = ETS(GDP),
    BoxCox = ETS(box_cox(GDP, bigL)),
    Damped = ETS(GDP ~ trend('Ad', phi = 0.9)),
    Log = ETS(log(GDP)))

chineseGDPComp %>%
  forecast(h = 20) %>%
  autoplot(chineseGDP, level = NULL) +
  labs(title = 'Chinese GDP ETS Forecast Options Comparison')

Question 4

aus_production %>%
  autoplot(Gas)

fitaus <- aus_production %>%
  model(Multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M")))
report(fitaus)

## Series: Gas 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.6528545 
##     beta  = 0.1441675 
##     gamma = 0.09784922 
## 
##   Initial states:
##      l[0]       b[0]      s[0]    s[-1]    s[-2]     s[-3]
##  5.945592 0.07062881 0.9309236 1.177883 1.074851 0.8163427
## 
##   sigma^2:  0.0032
## 
##      AIC     AICc      BIC 
## 1680.929 1681.794 1711.389

fitaus %>%
  forecast(h = 36) %>%
  autoplot(aus_production)

fitagain <- aus_production %>%
  model(Damped = ETS(Gas ~ error("A") + trend("Ad") + season("N")))
report(fitagain)

## Series: Gas 
## Model: ETS(A,Ad,N) 
##   Smoothing parameters:
##     alpha = 0.08442082 
##     beta  = 0.01242805 
##     phi   = 0.9799999 
## 
##   Initial states:
##      l[0]      b[0]
##  5.919847 0.2516645
## 
##   sigma^2:  251.9636
## 
##      AIC     AICc      BIC 
## 2386.146 2386.544 2406.453

fitagain %>%
  forecast(h = 36) %>%
  autoplot(aus_production)

modelComparison <- aus_production %>%
  model(
    Multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M")),
    Damped = ETS(Gas ~ error("A") + trend("Ad") + season("N")))

glance(modelComparison)

## # A tibble: 2 × 9
##   .model            sigma2 log_lik   AIC  AICc   BIC   MSE  AMSE     MAE
##   <chr>              <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Multiplicative   0.00324   -831. 1681. 1682. 1711.  21.1  32.2  0.0413
## 2 Damped         252.       -1187. 2386. 2387. 2406. 246.  241.  11.7

multiplicative seasonality is necessary due to the fact that the seasonal variation seems to change proportionally to the level.

The multiplicative method slightly outperformed the damped version of the model across the board. This means that damping didn’t improve the model and that the multiplicative method is a stronger model.

Question 5

5A

set.seed(7777777)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

myseries %>% autoplot(Turnover)

Multiplicative seasonality is good for this data because the magnitude of the seasonality fluctuation increases with the level.

5B

fitturn <- myseries %>%
  model(
    'Multiplicative Method' = ETS(Turnover ~ error('M') + trend('A') + season('M')),
    'Damped Method' = ETS(Turnover ~ error('M') + trend('Ad') + season('M'))
  )

HoltWinters <- fitturn %>% forecast(h = 36)

HoltWinters %>% autoplot(myseries, level = NULL)

5C

accuracy(fitturn) %>% select('.model', 'RMSE')

## # A tibble: 2 × 2
##   .model                 RMSE
##   <chr>                 <dbl>
## 1 Multiplicative Method  10.9
## 2 Damped Method          11.0

The multiplicative method has a slightly lower RMSE than the damped method. This means that this may be the more accurate model for the data.

5D

fitturn %>%
  select('Multiplicative Method') %>%
  gg_tsresiduals()

#better
fitturn %>%
  select('Damped Method') %>%
  gg_tsresiduals()

Both the residuals plot and histogram for the multiplicative method confirms that the residuals look like white noise with the exception of a few outliers. The ACF confirms that most of the residuals are within bounds. The damped method’s residuals look a little less like white noise and the residuals arent normally distributed.

5E

myseries_train <- myseries %>%
  filter(year(Month) < 2011)

fitty <- myseries_train %>%
  model(
    'Multiplicative Method' = ETS(Turnover ~ error('M') + trend('A') + season('M')),
    'Damped Method' = ETS(Turnover ~ error('M') + trend('Ad') + season('M')),
    'Seasonal Naive' = SNAIVE(Turnover))

comparison <- anti_join(myseries, myseries_train, by = c('State', 'Industry', 'Series ID', 'Month', 'Turnover'))
forecastResults <- fitty %>% forecast(comparison)

autoplot(comparison, Turnover) +
  autolayer(forecastResults, level = NULL) +
  labs(title = 'Forecast Method Comparison')

accuracy(fitty)

## # A tibble: 3 × 12
##   State  Indus…¹ .model .type     ME  RMSE   MAE    MPE  MAPE  MASE RMSSE   ACF1
##   <chr>  <chr>   <chr>  <chr>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 Victo… Hardwa… Multi… Trai… -0.453  9.86  7.22 -0.547  5.07 0.491 0.524 0.251 
## 2 Victo… Hardwa… Dampe… Trai…  0.491  9.78  7.22  0.214  5.05 0.490 0.519 0.0581
## 3 Victo… Hardwa… Seaso… Trai…  6.74  18.8  14.7   3.93   9.84 1     1     0.759 
## # … with abbreviated variable name ¹Industry

The Damped and Multiplicative methods both beat the Seasonal Niave method, with the Multiplicative method being the most accurate.

Question 6

6A

trippy <- tourism %>%
  summarise(Trips = sum(Trips))

trippy %>%
  autoplot(Trips)

The data is seasonal. There is a downward trend until around 2010, then there is a stronger upward trend.

6B

decomp <- trippy %>%
  model(STL(Trips)) %>% components()

decomp %>%
  as_tsibble() %>%
  autoplot(season_adjust)

6C

trippy %>%
  model(
    decomposition_model(STL(Trips),
                        ETS(season_adjust ~ error("A") + trend("Ad") + season("N")))) %>%
  forecast(h = "2 years") %>%
  autoplot(trippy)

6D

trippy %>%
  model(decomposition_model(STL(Trips),
                        ETS(season_adjust ~ error("A") + trend("A") + season("N")))) %>%
  forecast(h = "2 years") %>%
  autoplot(trippy)

6E

trippy %>%
  model(ETS(Trips)) %>%
  forecast(h = "2 years") %>%
  autoplot(trippy)

6F

fittrip <- trippy %>%
  model(
    Damped = decomposition_model(STL(Trips),
                                   ETS(season_adjust ~ error("A") + trend("Ad") + season("N"))), 
    Holt = decomposition_model(STL(Trips), 
                                  ETS(season_adjust ~ error("A") + trend("A") + season("N"))),
    ETS = ETS(Trips))

accuracy(fittrip)

## # A tibble: 3 × 10
##   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 Damped Training 103.   763.  576. 0.367  2.72 0.607 0.629 -0.0174 
## 2 Holt   Training  99.7  763.  574. 0.359  2.71 0.604 0.628 -0.0182 
## 3 ETS    Training 105.   794.  604. 0.379  2.86 0.636 0.653 -0.00151

Using the RMSE value, the Holt method to the STL Decomposition is the slightly better in-sample method.

6G

fittrip %>%
  forecast(h = "2 years") %>%
  autoplot(trippy, level = NULL) +
  guides(colour=guide_legend(title="Forecast"))

The forecasts are all very similar therefore, the model with the smallest RMSE value is most likely to be the best. (Holt)

6H

goodest <- fittrip %>%
  select(Holt) 
goodest %>%
  gg_tsresiduals()

## Warning: Removed 4 row(s) containing missing values (geom_path).

## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing non-finite values (stat_bin).

augment(goodest) %>%
  features(.resid, ljung_box, lag = 24, dof = 5)

## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 Holt      33.3    0.0221

The residuals seem to be close white noise, but when I did the ljung box test I found that my p-value is less than 0.05. This means that the residuals are not white noise. I believe this is due to a huge outlier.

Question 7

7A

bigZ <- aus_arrivals %>% filter(Origin == 'NZ')

autoplot(bigZ, Arrivals) +
  ggtitle("Arrivals from New Zealand")

The data has an upward trend, also the number of arrivals fall each Q1.

7B

choochoo <- bigZ %>%
  slice(1:(nrow(bigZ)-(4*2)))   

choochoo %>%
  model(
    ETS(Arrivals ~ error("M") + trend("A") + season("M"))) %>%
  forecast(h="2 years") %>%
  autoplot(level = NULL) + 
  autolayer(bigZ, Arrivals) +
  labs(title = "Train vs Test")

7C

Multiplicative seasonality is important because the size of the seasonal pattern grows in proportion to the level of the trend.

7D

letssee <- anti_join(bigZ, choochoo,
                     by = c("Quarter", "Origin", "Arrivals"))

fitem <- choochoo %>%
  model(
    'ETS model' = ETS(Arrivals),
    'Additive to Log' = ETS(log(Arrivals) ~ error("A") + trend("A") + season("A")),
    'Seasonal Naïve Method' = SNAIVE(Arrivals),
    'STL decomposition to Log' = decomposition_model(STL(log(Arrivals)), 
                                                     ETS(season_adjust))) 

fofo <- fitem %>%
  forecast(h="2 years") 

fofo %>%
  autoplot(level = NULL) + 
  autolayer(letssee, Arrivals) +
  guides(colour=guide_legend(title="Forecast")) +
  labs(title = "New Zealand Visitors")

7E

fofo %>% accuracy(bigZ)

## # A tibble: 4 × 11
##   .model     Origin .type      ME   RMSE    MAE    MPE  MAPE  MASE RMSSE    ACF1
##   <chr>      <chr>  <chr>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Additive … NZ     Test   -7093. 17765. 12851. -2.17   4.20 0.864 0.918  0.0352
## 2 ETS model  NZ     Test   -3495. 14913. 11421. -0.964  3.78 0.768 0.771 -0.0260
## 3 Seasonal … NZ     Test    9709. 18051. 17156.  3.44   5.80 1.15  0.933 -0.239 
## 4 STL decom… NZ     Test  -12535. 22723. 16172. -4.02   5.23 1.09  1.17   0.109

daddymodel <- fitem %>% select('ETS model')
daddymodel %>%
  gg_tsresiduals()

augment(daddymodel) %>%
  features(.resid, ljung_box)

## # A tibble: 1 × 3
##   .model    lb_stat lb_pvalue
##   <chr>       <dbl>     <dbl>
## 1 ETS model    1.29     0.256

Using the RMSE value, the best forecast is the ETS model’s forecast. When you look at the plot this also looks the best.

Both the acf plot and ljung box tests show that the ETS model passes the residual tests. Everything in the ACF plot is within the bounds and the ljung box value is more than 0.05.

7F

vallygirl <- bigZ %>%
  slice(1:(n() - 3)) %>%
  stretch_tsibble(.init = 36, .step = 3)

vallygirl %>% 
  model(
    'ETS model' = ETS(Arrivals),
    'Additive to Log' = ETS(log(Arrivals) ~ error("A") + trend("A") + season("A")),
    'Seasonal Naïve Method' = SNAIVE(Arrivals),
    'STL decomposition to Log' = decomposition_model(STL(log(Arrivals)), ETS(season_adjust))) %>%
  forecast(h = 3) %>%
  accuracy(bigZ)

## # A tibble: 4 × 11
##   .model          Origin .type    ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>           <chr>  <chr> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Additive to Log NZ     Test  -375. 14347. 11021. 0.200  5.83 0.741 0.746 0.309
## 2 ETS model       NZ     Test  4627. 15327. 11799. 2.23   6.45 0.793 0.797 0.283
## 3 Seasonal Naïve… NZ     Test  8244. 18768. 14422. 3.83   7.76 0.970 0.976 0.566
## 4 STL decomposit… NZ     Test  4252. 15618. 11873. 2.04   6.25 0.798 0.812 0.244

Using cross validation, the best method is Additive to Log. The previous best model, ETS model, is the second best using cross validation.

Question 8

8A

hardstuff <- aus_production %>%
  select(Quarter, Cement) %>%
  as_tsibble(index = Quarter)

autoplot(hardstuff)

## Plot variable not specified, automatically selected `.vars = Cement`

hardvally <- hardstuff %>%
  slice(1:(n()-4)) %>%                            
  stretch_tsibble(.init = 5, .step = 1)                    

cementmod <- hardvally %>%
  model(ETS(Cement),
        SNAIVE(Cement)) %>%
  forecast(h = "1 year")

8B

vallygirl <- hardstuff %>%
  stretch_tsibble(.init = 5, .step = 4)
 
cementmod2 <- vallygirl %>%
  model(ETS(Cement),
        SNAIVE(Cement)) %>%
  forecast(h= "1 year")

cementacc <- accuracy(cementmod2, hardstuff) %>%
  mutate(MSE = (RMSE)^2) %>%
  select(MSE, RMSE)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 3 observations are missing between 2010 Q3 and 2011 Q1

cementacc

## # A tibble: 2 × 2
##      MSE  RMSE
##    <dbl> <dbl>
## 1  9962.  99.8
## 2 18028. 134.

The ETS seems to be the most accurate model. I expected the ETS model to be best because it is designed for longer data sets and has no trouble estimating the parameters and trend if necessary.