Homework 5

library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)
library(feasts)

#Question 1

#Question 1 Part a
aus_livestock

## # A tsibble: 29,364 x 4 [1M]
## # Key:       Animal, State [54]
##       Month Animal                     State                        Count
##       <mth> <fct>                      <fct>                        <dbl>
##  1 1976 Jul Bulls, bullocks and steers Australian Capital Territory  2300
##  2 1976 Aug Bulls, bullocks and steers Australian Capital Territory  2100
##  3 1976 Sep Bulls, bullocks and steers Australian Capital Territory  2100
##  4 1976 Oct Bulls, bullocks and steers Australian Capital Territory  1900
##  5 1976 Nov Bulls, bullocks and steers Australian Capital Territory  2100
##  6 1976 Dec Bulls, bullocks and steers Australian Capital Territory  1800
##  7 1977 Jan Bulls, bullocks and steers Australian Capital Territory  1800
##  8 1977 Feb Bulls, bullocks and steers Australian Capital Territory  1900
##  9 1977 Mar Bulls, bullocks and steers Australian Capital Territory  2700
## 10 1977 Apr Bulls, bullocks and steers Australian Capital Territory  2300
## # … with 29,354 more rows

aus_pigs <- aus_livestock %>%
  filter(Animal == "Pigs") %>%
  filter(State == "Victoria") %>%
  filter(Month >= yearmonth(as.Date("2010-01-01")))

aus_pigs_expo <- aus_pigs %>%
  model(ETS(Count ~ error("A") + trend("N") + season("N")))

report(aus_pigs_expo)

## Series: Count 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.225021 
## 
##   Initial states:
##      l[0]
##  67178.49
## 
##   sigma^2:  49463947
## 
##      AIC     AICc      BIC 
## 2423.061 2423.292 2431.107

#The optimal value of alpha is 0.225021 and the optimal value of lambda is 67178.49.
aus_pigs_expo %>%
  forecast(h = 4) %>%
  autoplot(aus_pigs) +
  geom_line(aes(y = .fitted), col="#D55E00",
            data = augment(aus_pigs_expo)) +
  labs(title = "Forecast of Slaugtered Victorian Pigs using Simple Exponential Smoothing (h = 4)")

#Question 1 Part b
pigcast <- aus_pigs_expo %>%
  forecast(h = 4)
pigcast %>%
  hilo(95)

## # A tsibble: 4 x 7 [1M]
## # Key:       Animal, State, .model [1]
##   Animal State   .model    Month             Count  .mean                  `95%`
##   <fct>  <fct>   <chr>     <mth>            <dist>  <dbl>                 <hilo>
## 1 Pigs   Victor… "ETS(… 2019 Jan N(95295, 4.9e+07) 95295. [81510.15, 109079.2]95
## 2 Pigs   Victor… "ETS(… 2019 Feb N(95295, 5.2e+07) 95295. [81165.48, 109423.9]95
## 3 Pigs   Victor… "ETS(… 2019 Mar N(95295, 5.4e+07) 95295. [80829.01, 109760.4]95
## 4 Pigs   Victor… "ETS(… 2019 Apr N(95295, 5.7e+07) 95295. [80500.19, 110089.2]95

95295 - 1.96 *(sqrt(49463947)) #Manually took the point forecast out and took the sqrt of sigma squared, which was calculated in part a.

## [1] 81510.2

95295 + 1.96 *(sqrt(49463947))

## [1] 109079.8

#The prediction interval I manually calculated is [81510.2, 109079.8]. The prediction intervals R calculated are more precise since they are for each forecasted period, although they are still very close to my manually calculated values.

#Question 2

#Question 2 Part a 
global_economy %>%
  distinct(Country)

## # A tibble: 263 × 1
##    Country            
##    <fct>              
##  1 Afghanistan        
##  2 Albania            
##  3 Algeria            
##  4 American Samoa     
##  5 Andorra            
##  6 Angola             
##  7 Antigua and Barbuda
##  8 Arab World         
##  9 Argentina          
## 10 Armenia            
## # … with 253 more rows

brazil_econ <- global_economy %>%
  filter(Country == "Brazil")

brazil_train <- brazil_econ %>%
  filter(Year <= year(as.Date("2015-01-01")))

brazil_econ %>%
  autoplot(Exports) +
  labs(title = "Brazil Exports")

#Brazil's exports fluctuates a lot in the short run although there is a steady positive growth trend.   

#Question 2 Part b
brazil_econ_expo <- brazil_train %>%
  model(ETS(Exports ~ error("A") + trend("N") + season("N")))

brazil_econ_expo %>%
  forecast(h = 10)%>%
  autoplot(brazil_train) +
  labs(title = "Ten Year Forecast of Brazilian Exports") +
  labs(subtitle = "Using ETS(A,N,N)")

#Question 2 Part c
accuracy(brazil_econ_expo)

## # A tibble: 1 × 11
##   Country .model        .type    ME  RMSE   MAE    MPE  MAPE  MASE RMSSE    ACF1
##   <fct>   <chr>         <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Brazil  "ETS(Exports… Trai… 0.121  1.58  1.18 -0.859  13.3 0.971 0.977 0.00195

#My RMSE is 1.58.

#Question 2 Part d
brazil_econ_linear <- brazil_train %>%
  model(ETS(Exports ~ error("A") + trend("A") + season("N")))

accuracy(brazil_econ_linear)

## # A tibble: 1 × 11
##   Country .model       .type      ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <fct>   <chr>        <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Brazil  "ETS(Export… Trai… 1.27e-4  1.58  1.19 -2.24  13.5 0.977 0.974 0.00810

#The ANN model is almost identical to the naive method since there is no trend or seasonality. There is a slight trend in the data, so while the RMSEs are the same the AAN method is best for forecasting this data between the two. 

#Question 2 Part e
brazil_econ_linear %>%
  forecast(h = 10)%>%
  autoplot(brazil_train) +
  labs(title = "Ten Year Forecast of Brazil Exports") +
  labs(subtitle = "Using ETS(A,A,N)")

#After comparing the plots for the two methods, it definitely appears that the AAN is the best since it captures the slight trend present.

#Question 3

china_gdp <- global_economy %>%
  filter(Country == "China")
autoplot(china_gdp) +
  labs(title = "Chinese GDP")

## Plot variable not specified, automatically selected `.vars = GDP`

china_log <- china_gdp %>%
  mutate(GDP = log(GDP))
autoplot(china_log) +
  labs(title = "Logged Chinese GDP") #Logged GDP to smooth out the exponential-like growth China experienced

## Plot variable not specified, automatically selected `.vars = GDP`

china_ann <- china_gdp %>%
  model(ETS(GDP ~ error("A") + trend("N") + season("N")))

china_ann %>%
  forecast(h = 20)%>%
  autoplot(china_gdp) +
  labs(title = "Twenty Year Forecast of Chinese GDP (ETS = (A,N,N))")

#The very strong and clear upward trend is completely ignored by this model. Let's used the logged data and see if it looks better.

china_lann <- china_log %>%
  model(ETS(GDP ~ error("A") + trend("N") + season("N")))

china_lann %>%
  forecast(h = 20)%>%
  autoplot(china_log) +
  labs(title = "Twenty Year Forecast Logged of Chinese GDP") +
  labs(subtitle = "Using ETS(A,N,N)")

#This forecast looks a lot better, although it is still missing out on most of the upward trend. I think the logged AAN method will yield the best results.

china_aan <- china_gdp %>%
  model(ETS(GDP ~ error("A") + trend("A") + season("N")))

china_aan %>%
  forecast(h = 20)%>%
  autoplot(china_gdp) +
  labs(title = "Twenty Year Forecast of Chinese GDP") +
  labs(subtitle = "Using ETS(A,A,N)")

#This forecast looks a lot better than the previous ones. It definitely needs a damp since the growth will slow down in the future, but I will use AAN on the logged data first so I can compare.

china_laan <- china_log %>%
  model(ETS(GDP ~ error("A") + trend("A") + season("N")))

china_laan %>%
  forecast(h = 20)%>%
  autoplot(china_log) +
  labs(title = "Twenty Year Forecast of Chinese Logged GDP") +
  labs(subtitle = "Using ETS(A,A,N)")

#This forecast looks even better than the last one since the exponential-like growth has been smoothed out. However, the growth will slow down in the future so a damp is definitely necessary.

china_gdp %>%
  model(`Damped Holt's method` = ETS(GDP ~ error("A") +
                                   trend("Ad", phi = 0.9) + season("N"))) %>%
  forecast(h = 20) %>%
  autoplot(china_gdp) +
  labs(title = "Twenty Year Forecast of Chinese GDP") +
  labs(subtitle = "Using ETS(A,A,N) with a Damping Component")

#This forecast controls for limited growth into the future but is being forecasted off an uncontrolled-for exponential-like growth. I expect the logged damped that I will run next to work best.

china_log %>%
  model(`Damped Holt's method` = ETS(GDP ~ error("A") +
                                   trend("Ad", phi = 0.9) + season("N"))) %>%
  forecast(h = 20) %>%
  autoplot(china_log) +
  labs(title = "Twenty Year Forecast of Chinese Logged GDP") +
  labs(subtitle = "Using ETS(A,A,N) with a Damping Component")

#As expected, this forecast looks a lot better than all previous ones. However, none of the methods I have tried fully capture the exponential growth, at least in the short-run. Box-Cox is not applicable here since there is no seasonality.

#Question 4

aus_gas <- aus_production %>%
  select("Gas")

autoplot(aus_gas) +
  labs(title = "Australian Gas Production")

## Plot variable not specified, automatically selected `.vars = Gas`

#Multiplicative seasonality is necessary since the variance of the seasonality is increasing overtime. 
aus_hwplusdamp <- aus_gas %>%
  model(
    hw = ETS(Gas ~ error("M") + trend("A") + season("M")),
    hwdamped = ETS(Gas ~ error("M") + trend("Ad", phi = 0.9) + season("M")))%>%
  forecast(h = 36) %>%
  autoplot(aus_gas) +
  labs(title = "Thirty-Six Quarter Forecast of Australian Gas Production") +
  labs(subtitle = "Using Holt-Winters Multiplicative Method") 
aus_hwplusdamp

res_hw <- aus_gas %>%
  model(
    hw = ETS(Gas ~ error("M") + trend("A") + season("M")))
res_hwdamped <- aus_gas %>%
  model(
    hw = ETS(Gas ~ error("M") + trend("Ad", phi = 0.9) + season("M")))
residuals(res_hw)

## # A tsibble: 218 x 3 [1Q]
## # Key:       .model [1]
##    .model Quarter  .resid
##    <chr>    <qtr>   <dbl>
##  1 hw     1956 Q1  0.0181
##  2 hw     1956 Q2 -0.0958
##  3 hw     1956 Q3  0.0267
##  4 hw     1956 Q4  0.0901
##  5 hw     1957 Q1 -0.0387
##  6 hw     1957 Q2  0.0495
##  7 hw     1957 Q3 -0.0986
##  8 hw     1957 Q4  0.0358
##  9 hw     1958 Q1 -0.0350
## 10 hw     1958 Q2  0.0501
## # … with 208 more rows

residuals(res_hwdamped)

## # A tsibble: 218 x 3 [1Q]
## # Key:       .model [1]
##    .model Quarter  .resid
##    <chr>    <qtr>   <dbl>
##  1 hw     1956 Q1  0.0429
##  2 hw     1956 Q2 -0.0871
##  3 hw     1956 Q3  0.0237
##  4 hw     1956 Q4  0.0877
##  5 hw     1957 Q1 -0.0456
##  6 hw     1957 Q2  0.0483
##  7 hw     1957 Q3 -0.0925
##  8 hw     1957 Q4  0.0339
##  9 hw     1958 Q1 -0.0381
## 10 hw     1958 Q2  0.0577
## # … with 208 more rows

#The damped model appears to have reduced the size of most of the residuals which indicates a better fit for the data. Looking at the plot, it also appears to be the better forecast.

#Question 5

#Question 5 Part a
set.seed(105)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries %>% autoplot(Turnover)

#Multiplicative seasonality is needed since the variance in the seasonality is not constant and is increasing over time. 

#Question 5 Part b
myseries_holtmult <- myseries %>% 
  model(
    hw = ETS(Turnover ~ error("M") + trend("A") + season("M")))%>%
  forecast(h = 36) %>%
  autoplot(myseries) +
  labs(title = "Three Year Forecast of Australian Retail Turnover") +
  labs(subtitle = "Using Holt-Winters Multiplicative Method") 
myseries_holtmult

myseries_damped <- myseries %>% 
  model(
    hwdamped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M"))) %>%
  forecast(h = 36) %>%
  autoplot(myseries) +
  labs(title = "Three Year Forecast of Australian Retail Turnover") +
  labs(subtitle = "Using Damped Holt-Winters Multiplicative Method") 
myseries_damped

#Question 5 Part c
myseries %>%
  stretch_tsibble(.init = 10) %>%
  model(
    hw = ETS(Turnover ~ error("M") + trend("A") + season("M")),
    hwdamped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M")))%>%
  forecast(h = 1) %>%
  accuracy(myseries)

## Warning: 8 errors (2 unique) encountered for hw
## [3] A seasonal ETS model cannot be used for this data.
## [5] Not enough data to estimate this ETS model.

## Warning: 8 errors (2 unique) encountered for hwdamped
## [3] A seasonal ETS model cannot be used for this data.
## [5] Not enough data to estimate this ETS model.

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 2019 Jan

## # A tibble: 2 × 12
##   .model   State  Indus…¹ .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>    <chr>  <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 hw       Queen… Food r… Test   1.18  23.3  17.6 0.126  2.05 0.295 0.320 0.220 
## 2 hwdamped Queen… Food r… Test   4.10  23.3  17.4 0.471  2.00 0.292 0.320 0.0782
## # … with abbreviated variable name ¹Industry

#The RMSEs for the HW and the HW-damped are identical, which checks out since the plots looked very similar. I prefer the damped method since retail is declining in the face of online shopping. Turnover will likely decline with it and a good forecast should capture this. 

#Question 5 Part d
dampedcheck <- myseries %>%
  model(hw = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M")))
gg_tsresiduals(dampedcheck)

#The ACF plot shows significant autocorrelation so these residuals do not resemble white noise.

#Question 5 Part e
myseries %>%
  filter(year(Month) < 2011) %>%
  model(
    snaive = SNAIVE(Turnover),
    hw = ETS(Turnover ~ error("M") + trend("A") + season("M")),
    hwdamped = ETS(Turnover ~ error("M") + trend("Ad", phi = 0.9) + season("M"))
  ) %>%
  forecast(h = "7 years") %>%
  accuracy(myseries)

## # A tibble: 3 × 12
##   .model   State   Indus…¹ .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>   <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hw       Queens… Food r… Test  -102.  126.  105. -4.91  5.09  1.87  1.77 0.899
## 2 hwdamped Queens… Food r… Test   186.  226.  188.  8.81  8.96  3.35  3.17 0.898
## 3 snaive   Queens… Food r… Test   292.  321.  292. 14.2  14.2   5.19  4.51 0.911
## # … with abbreviated variable name ¹Industry

#The HW and HW damped both beat the SNAIVE since their RMSEs are lower.

#Question 6

#Question 6 Part a 
aus_trips <- tourism %>%
  summarise(Trips = sum(Trips))
aus_trips %>%
  autoplot(Trips)

#The data has a significant degree of seasonality. It appears to follow a very slight downward trend until around 2010 when it enters a strong positive growth trend. 

#Question 6 Part b
aus_stl <- aus_trips %>%
  model(STL(Trips)) %>%
  components()
aus_stl %>%
  as_tsibble() %>%
  autoplot(season_adjust)

#Question 6 Part c
stldamped <- decomposition_model(
  STL(Trips),
  ETS(season_adjust ~ error("A") + trend("Ad") + season("N"))
)
aus_trips %>%
  model(dcmp_AAdN = stldamped) %>%
  forecast(h = "2 years") %>%
  autoplot(aus_trips)

#Question 6 Part d
stlholt <- decomposition_model(
  STL(Trips),
  ETS(season_adjust ~ error("A") + trend("A") + season("N")))
aus_trips %>%
  model(dcmp_AAN = stlholt) %>%
  forecast(h = "2 years") %>%
  autoplot(aus_trips)

#Question 6 Part e
aus_trips %>%
  model(ets = ETS(Trips)) %>%
  forecast(h = "2 years") %>%
  autoplot(aus_trips)

#Question 6 Part f
fitcheck6 <- aus_trips %>%
  model(
    dcmp_AAdN = stldamped,
    dcmp_AAN = stlholt,
    ets = ETS(Trips)
  )
accuracy(fitcheck6)

## # A tibble: 3 × 10
##   .model    .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr>     <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 dcmp_AAdN Training 103.   763.  576. 0.367  2.72 0.607 0.629 -0.0174 
## 2 dcmp_AAN  Training  99.7  763.  574. 0.359  2.71 0.604 0.628 -0.0182 
## 3 ets       Training 105.   794.  604. 0.379  2.86 0.636 0.653 -0.00151

#The RMSEs for the damped and HW are identical. The ETS has the worst fit. 

#Question 6 Part g
fitcheck6 %>%
  forecast(h = "2 years") %>%
  autoplot(aus_trips, level = NULL)

#The three forecasts are very similar. I would not pick the ETS due to the relatively high RMSE. Between the two AANs, I would pick the regular undamped HW since the number of trips will hit a "maximum" at some point. Australian air traffice can only sustain so many travellers.

#Question 6 Part h
residual6 <- fitcheck6 %>%
  select(dcmp_AAN)
augment(residual6) %>% 
  gg_tsdisplay(.resid, lag_max = 24, plot_type = "histogram")

## Warning: Removed 4 row(s) containing missing values (geom_path).

## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing non-finite values (stat_bin).

augment(residual6) %>%
  features(.innov, ljung_box, lag = 24, dof = 4)

## # A tibble: 1 × 3
##   .model   lb_stat lb_pvalue
##   <chr>      <dbl>     <dbl>
## 1 dcmp_AAN    33.3    0.0311

#The p-value of the LB test is <.05 so we can reject the null hypothesis and say that the residuals do not resemble white noise.

#Question 7

#Question 7 Part a
nz <- aus_arrivals %>% 
  filter(Origin == "NZ")
nz %>% 
  autoplot(Arrivals / 1e3) + labs(y = "Thousands of People")

#There is an obvious upward trend with a high degree of seasonality. It also appears that the variance of the seasonality is not constant. 

#Question 7 Part b
nz_train <- nz %>%
  slice(1:(n() - 8))
nz_train %>%
  model(ETS(Arrivals ~ error("M") + trend("A") + season("M"))) %>%
  forecast(h = "2 years") %>%
  autoplot() +
  autolayer(nz, Arrivals)

#Question 7 Part c
#Multiplicative seasonality is necessary since the variation in seasonality is not constant over time. 

#Question 7 Part d
finalstep <- anti_join(nz, nz_train,
                       by = c("Quarter", "Origin", "Arrivals"))
nz_fc <- nz_train %>%
  model(
    ets = ETS(Arrivals),
    log_ets = ETS(log(Arrivals)),
    snaive = SNAIVE(Arrivals),
    stl = decomposition_model(STL(log(Arrivals)), ETS(season_adjust)))

forecast7 <- nz_fc %>%
  forecast(h = "2 years")
forecast7 %>%
  autoplot(level = NULL) +
  autolayer(finalstep, Arrivals)

#Question 7 Part e
forecast7 %>% accuracy(nz)

## # A tibble: 4 × 11
##   .model  Origin .type      ME   RMSE    MAE    MPE  MAPE  MASE RMSSE    ACF1
##   <chr>   <chr>  <chr>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ets     NZ     Test   -3495. 14913. 11421. -0.964  3.78 0.768 0.771 -0.0260
## 2 log_ets NZ     Test    2467. 13342. 11904.  1.03   4.03 0.800 0.689 -0.0786
## 3 snaive  NZ     Test    9709. 18051. 17156.  3.44   5.80 1.15  0.933 -0.239 
## 4 stl     NZ     Test  -12535. 22723. 16172. -4.02   5.23 1.09  1.17   0.109

checkres7 <- nz_fc %>%
  select('ets')
checkres7 %>%
  gg_tsresiduals()

#The log ets appears to have the most accurate forecast based on the RMSE scores. Based on the ACF, it appears that the residuals are not autocorrelated. However, the plot of the residuals does not appear to resemble white noise.

#Question 7 Part f
nz_cval <- nz %>%
  slice(1:(n() - 3)) %>%
  stretch_tsibble(.init = 36, .step = 3)
nz_cval %>%
  model(
    ets = ETS(Arrivals),
    log_ets = ETS(log(Arrivals)),
    snaive = SNAIVE(Arrivals),
    stl = decomposition_model(STL(log(Arrivals)), ETS(season_adjust))
  ) %>%
  forecast(h = 3) %>%
  accuracy(nz)

## # A tibble: 4 × 11
##   .model  Origin .type    ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>   <chr>  <chr> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ets     NZ     Test  4627. 15327. 11799.  2.23  6.45 0.793 0.797 0.283
## 2 log_ets NZ     Test  4388. 15047. 11566.  1.99  6.36 0.778 0.782 0.268
## 3 snaive  NZ     Test  8244. 18768. 14422.  3.83  7.76 0.970 0.976 0.566
## 4 stl     NZ     Test  4252. 15618. 11873.  2.04  6.25 0.798 0.812 0.244

#The log_ets has the lowest RMSE score and appears to be the best.

#Question 8

#Question 8 Part a
q8 <- aus_production %>%
  select(Quarter, Cement) %>%
  as_tsibble(index = Quarter)

autoplot(q8)

## Plot variable not specified, automatically selected `.vars = Cement`

#Question 8 Part b
thelightattheend <- q8 %>%
  stretch_tsibble(.init = 5, .step = 4)

lastvar <- thelightattheend %>%
  model(ETS(Cement),
        SNAIVE(Cement)) %>%
  forecast(h= "1 year")

reallast <- accuracy(lastvar, q8) %>%
  mutate(MSE = (RMSE)^2) %>%
  select(MSE, RMSE)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 3 observations are missing between 2010 Q3 and 2011 Q1

reallast

## # A tibble: 2 × 2
##      MSE  RMSE
##    <dbl> <dbl>
## 1  9962.  99.8
## 2 18028. 134.

#It appears that the ETS is the most accurate. This dataset is very long so ETS should work better for it.

Homework 5

Max Harley

2022-11-09