Load packages and data

# Note: fpp3 attaches the core tidyverts packages (tsibble, feasts, fable) along
# with dplyr, ggplot2, tidyr, tibble and lubridate, so several of the loads
# below are redundant but harmless.
library(dplyr)
library(ggplot2)
library(tidyr)
library(tibble)
library(tsibble)
library(ggfortify)
library(tidyverse)
library(fpp3)
library(moments)
library(zoo)
library(fable)
library(brew)
library(sos)
library(slider)
library(seasonal)
library(x13binary)
library(USgas)

Questions

Question 1

# a. Australian Population (global_economy)
aus_population <- global_economy %>%
  filter(Country == "Australia") %>%
  select(-c(Code, GDP, Growth, CPI, Imports, Exports)) %>%
  mutate(Population = Population/1000000)

aus_population %>%
  model(RW(Population ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_population) +
  geom_line(data = slice(aus_population, range(cumsum(!is.na(Population)))),
            aes(y = Population), linetype = "dashed", color = "blue") +
  labs(title = "Australian Population in Millions", y = "Population in Millions")

# b. Bricks (aus_production)

aus_bricks <- aus_production %>%
  filter_index("1956 Q1" ~ "2004 Q4") %>%
  select(-c(Beer, Tobacco, Cement, Electricity, Gas))

aus_bricks %>%
  model(SNAIVE(Bricks ~ lag("10 year"))) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_bricks) +
  labs(title = "Australian Brick Production")

aus_bricks_stl <- aus_bricks %>%
  model(STL(Bricks ~ season(window = 4), robust = TRUE)) %>%
  components() %>% 
  autoplot() +
  labs(title = "STL decomposition: Australian Bricks")
aus_bricks_stl

# c. NSW Lambs (aus_livestock)

aus_lambs <- aus_livestock %>%
  filter(Animal == "Lambs" & State == "New South Wales") %>%
  mutate(Count = Count/1000)

aus_lambs_stl <- aus_lambs %>%
  model(STL(Count ~ season(window = 12), robust = TRUE)) %>%
  components() %>% 
  autoplot() +
  labs(title = "STL decomposition: NSW Lambs")
aus_lambs_stl

aus_lambs %>%
  model(SNAIVE(Count ~ lag("5 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_lambs) +
  labs(title = "Australian Brick Production")

# d. Household wealth (hh_budget)

hh_wealth <- hh_budget %>%
  select(-c(Debt, DI, Expenditure, Savings, Unemployment))

hh_wealth %>%
  model(RW(Wealth ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(hh_wealth) +
  geom_line(data = slice(hh_wealth, range(cumsum(!is.na(Wealth)))),
            aes(y = Wealth), linetype = "dashed", color = "blue") +
  labs(title = "Australian Population in Millions", y = "Population in Millions")
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

# e. Australian takeaway food turnover (aus_retail).

aus_takeaway <- aus_retail %>%
  filter(Industry == "Takeaway food services")

takeaway_ends <- aus_takeaway %>%
  as_tibble() %>%
  group_by(State) %>%
  filter(!is.na(Turnover)) %>%
  slice(c(1, n())) %>%        # first and last observation per state
  ungroup()

aus_takeaway %>%
  model(RW(Turnover ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_takeaway) +
  geom_line(data = takeaway_ends, aes(y = Turnover, group = State),
            linetype = "dashed", color = "blue") +
  labs(title = "Australian Takeaway Food Turnover", y = "Turnover ($ million)") +
  facet_wrap(~State, scales = "free")

Question 2

# a. Produce a time plot of the series.

fb <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE) %>%
  select(Date, Close)

fb %>%
  autoplot(Close) +
  labs(y = "Price($)", title = "Facebook Price Through Time", x = "Trading day")

# b + c. Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations.

fb %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 100) %>%
  autoplot(fb) +
  geom_line(data = slice(fb, range(cumsum(!is.na(Close)))),
   aes(y=Close), linetype = "dashed", color = "blue") +
  labs(title = "Drift Forecast for Facebook", y = "Price($)", x = "Trading day")

# d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb %>%
  model(NAIVE(Close)) %>%
  forecast(h = 100) %>%
  autoplot(fb) +
  labs(title = "Naive Forecast for Facebook", x = "Trading day", y = "Price($)")

fb %>%
  model(MEAN(Close)) %>%
  forecast(h = 100) %>%
  autoplot(fb) +
  labs(title = "Naive Forecast for Facebook", x = "Trading day", y = "Price($)")

The drift method is the best of the three for forecasting Facebook's stock price because it captures the upward trend. The naive model fails to reflect that trend, making it a weaker forecast. The mean model shows no trend either, and it does not give the last closing price the weight it deserves as a basis for the future: distant past closing prices should not influence our forecast as heavily as they do under the mean method.
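
To put numbers behind this, a hedged sketch of a train/test comparison of the three benchmarks (the 900-day cutoff is an arbitrary assumption):

fb_train <- fb %>% filter(day <= 900)

fb_train %>%
  model(
    Mean  = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift())
  ) %>%
  forecast(h = nrow(fb) - 900) %>%   # forecast over the held-out days
  accuracy(fb) %>%
  select(.model, RMSE, MAE, MAPE)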

Question 3

vic_livestock <- aus_livestock %>%
  filter(State == "Victoria")

vic_bull <- vic_livestock %>%
  filter(Animal == "Bulls, bullocks and steers")
vic_bull %>%
  model(SNAIVE(Count ~ lag("2 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_bull) +
  labs(title = "Victorian Bull Count")

vic_calves <- vic_livestock %>%
  filter(Animal == "Calves")
vic_calves %>%
  model(SNAIVE(Count ~ lag("5 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_calves) +
  labs(title = "Victorian Calf Count")

vic_cattle <- vic_livestock %>%
  filter(Animal == "Cattle (excl. calves)")
vic_cattle %>%
  model(SNAIVE(Count ~ lag("3 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_cattle) +
  labs(title = "Victorian Cattle Count")

vic_cows <- vic_livestock %>%
  filter(Animal == "Cows and heifers")
vic_cows %>%
  model(SNAIVE(Count ~ lag("5 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_cows) +
  labs(title = "Victorian Cow Count")

vic_lambs <- vic_livestock %>%
  filter(Animal == "Lambs")
vic_lambs %>%
  model(SNAIVE(Count ~ lag("5 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_lambs) +
  labs(title = "Victorian Lamb Count")

vic_pigs <- vic_livestock %>%
  filter(Animal == "Pigs")
vic_pigs %>%
  model(SNAIVE(Count ~ lag("5 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_pigs) +
  labs(title = "Victorian Pig Count")

vic_sheep <- vic_livestock %>%
  filter(Animal == "Sheep")
vic_sheep %>%
  model(SNAIVE(Count ~ lag("5 years"))) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_sheep) +
  labs(title = "Victorian Sheep Count")

I believe the seasonal naive forecast is not a reasonable benchmark for most of this dataset. While it produces plausible-looking forecasts, it fails to account for any trend in the data, and it is very unlikely that the seasonal pattern will repeat exactly as it did in previous periods for most of these series. I would use the mean method for the sheep, cattle, pig, bull, and cow series; I believe the only series the seasonal naive method works well for is the calf count. Lastly, I would use the drift method for lambs.
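
As a concrete check of that recommendation, the benchmarks can be plotted side by side for one series; a sketch for the lamb data (the model labels are mine):

vic_lambs %>%
  model(
    Mean           = MEAN(Count),
    Drift          = RW(Count ~ drift()),
    Seasonal_Naive = SNAIVE(Count)
  ) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_lambs, level = NULL) +
  labs(title = "Benchmark comparison: Victorian lamb count") +
  guides(color = guide_legend(title = "Forecast"))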

Question 4

  1. Good forecast methods should have normally distributed residuals. False, strictly speaking. What a good method must produce is residuals that resemble "white noise": uncorrelated, with zero mean. Normally distributed residuals are convenient because they make prediction intervals easy to compute, but when the residuals are not normal the intervals can be obtained by bootstrapping instead, so normality is not essential (see the diagnostics sketch after this list).

  2. A model with small residuals will give good forecasts. False. While minimizing the sum of squared residuals helps us fit the training data well, there is no guarantee that this yields a "good" forecast: a model can fit the past closely (even overfit it) and still predict poorly, because you can never be certain what will happen in the future.

  3. The best measure of forecast accuracy is MAPE. False. No single measure is best in every situation. MAPE is undefined when the data contain zeros, explodes for values near zero, and assumes the measurement scale has a meaningful zero; measures such as RMSE (which penalizes large errors heavily) or the scale-free MASE are often more appropriate.

  4. If your model does not forecast well, you should make it more complicated. False. A poor forecast does not mean the model needs to be more complicated; there are many other ways to improve it. The model may omit an important variable or use poorly chosen predictors, and fixing those specification problems does not necessarily add complexity.

  5. Always choose the model with the best forecast accuracy as measured on the test set. True, with a caveat. Using a training/test split (or time series cross-validation) we fit the model on the training data and then see how well it forecasts data it has never seen, so test-set accuracy is the right basis for comparison. The caveat is that the test set must be large enough for the accuracy measures to be reliable.
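
As a concrete illustration of the residual properties in statement 1, the standard diagnostics can be run on any fitted benchmark; a sketch using the NSW lamb series from Question 1 (lamb_fit is a name introduced here; lag = 24 follows the 2m rule of thumb for monthly data):

lamb_fit <- aus_lambs %>%
  model(SNAIVE(Count))

lamb_fit %>%
  gg_tsresiduals()              # time plot, ACF and histogram of the residuals

augment(lamb_fit) %>%           # Ljung-Box portmanteau test for autocorrelation
  features(.resid, ljung_box, lag = 24, dof = 0)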

Question 5

set.seed(87654321)
myaus <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

# a. Create a training dataset consisting of observations before 2011.

train_myaus <- myaus %>%
  filter(year(Month) < 2011)

# b. Check that your data have been split appropriately by producing the following plot.

autoplot(myaus, Turnover) +
  autolayer(train_myaus, Turnover, color = "blue")

# c. Calculate seasonal naive forecasts using SNAIVE() applied to your training data (myseries_train).

myaus_fit <- train_myaus %>%
  model(SNAIVE(Turnover))

# d. Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?

myaus_fit %>% 
  gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).

The residuals are correlated up to lag 7. They are also not normally distributed: based on the histogram above, the distribution of residuals is right-skewed.
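
The visual impression can be backed up with a Ljung-Box test on the same fit; a short sketch (lag = 24 is the usual 2m choice for monthly data):

augment(myaus_fit) %>%
  features(.resid, ljung_box, lag = 24, dof = 0)
# a p-value near zero would confirm that the residuals are autocorrelated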

# e. Produce forecasts for the test data.

myaus_forecast <- myaus_fit %>%
  forecast(new_data = anti_join(myaus, train_myaus))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
myaus_forecast %>% 
  autoplot(myaus)

# f. Compare the accuracy of your forecasts against the actual values.

myaus_fit %>% 
  accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Footwea… SNAIV… Trai…  5.12  10.2  7.33  5.99  8.82     1     1 0.681
myaus_forecast %>% 
  accuracy(myaus)
## # A tibble: 1 × 12
##   .model     State Indus…¹ .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr> <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Tu… Vict… Footwe… Test  -1.47  20.4  16.0 -2.37  9.54  2.19  1.99 0.646
## # … with abbreviated variable name ¹​Industry
  1. How sensitive are the accuracy measures to the amount of training data used? (Challenge) Accuracy measures are sensitive to the amount of training data used. More training data generally allows a better-fitted model, but it also leaves less data for testing, so the test-set accuracy measures themselves become less reliable as the training share of the dataset grows.
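
A hedged sketch of that trade-off: refit the same seasonal naive model with several training cutoffs (the cutoff years are arbitrary assumptions) and compare the test-set errors.

library(purrr)   # map_dfr; attached with the tidyverse

map_dfr(c(2005, 2008, 2011), function(cutoff) {
  train <- myaus %>% filter(year(Month) < cutoff)
  train %>%
    model(SNAIVE(Turnover)) %>%
    forecast(new_data = anti_join(myaus, train)) %>%
    accuracy(myaus) %>%
    mutate(train_end = cutoff)
}) %>%
  select(train_end, RMSE, MAPE)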

Question 6

# a. Produce some plots of the data in order to become familiar with it.

nsw_livestock <- aus_livestock %>%
  filter(Animal == "Pigs", State == "New South Wales")

nsw_livestock %>%
  autoplot()
## Plot variable not specified, automatically selected `.vars = Count`

nsw_livestock_stl <- nsw_livestock %>%
  model(STL(Count ~ season(window = 12), robust=TRUE)) %>%
  components() %>% 
  autoplot() +
  labs(title = "STL Decomposition: NSW Pigs Count")
nsw_livestock_stl

nsw_livestock %>%
  gg_season()
## Plot variable not specified, automatically selected `y = Count`

nsw_livestock %>%
  gg_subseries()
## Plot variable not specified, automatically selected `y = Count`

# b. Create a training set of 486 observations

nswtrain <- nsw_livestock %>%
  filter(year(Month) < 2013)

# c. Try using various benchmark methods to forecast the training set and compare the results on the test set. Which method did best?

nswtrain_fit <- nswtrain %>%
  model(
    mean = MEAN(Count),
    Naive = NAIVE(Count),
    Seasonal_Naive = SNAIVE(Count),
    Drift = RW(Count ~ drift())
  ) 
nswtrain_fc <- nswtrain_fit %>%
  forecast(h = 12) %>%
  autoplot(nswtrain, level = NULL) +
  labs(title = "Forecasts for quarterly pig count") +
  guides(color = guide_legend(title = "Forecast"))
nswtrain_fc

accuracy(nswtrain_fit)
## # A tibble: 4 × 12
##   Animal State     .model .type        ME   RMSE    MAE    MPE  MAPE  MASE RMSSE
##   <fct>  <fct>     <chr>  <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Pigs   New Sout… mean   Trai…  2.42e-12 25389. 21496. -5.59   20.8  2.03  1.75
## 2 Pigs   New Sout… Naive  Trai… -3.98e+ 1 15324. 12171. -1.02   11.2  1.15  1.05
## 3 Pigs   New Sout… Seaso… Trai… -8.23e+ 2 14530. 10600  -1.83   10.1  1     1   
## 4 Pigs   New Sout… Drift  Trai…  3.01e-12 15324. 12173. -0.985  11.2  1.15  1.05
## # … with 1 more variable: ACF1 <dbl>
## # ℹ Use `colnames()` to see all variable names

On the training data, the seasonal naive method had the lowest errors, as seen in accuracy(nswtrain_fit). Note that these are training-set accuracies; the question asks for a test-set comparison, which requires scoring the forecasts against the held-out observations.
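
A sketch of that test-set comparison, reusing nswtrain_fit and the full nsw_livestock series:

nswtrain_fit %>%
  forecast(h = 12) %>%             # same 12-month horizon as above
  accuracy(nsw_livestock) %>%      # score against the held-out actuals
  select(.model, RMSE, MAE, MAPE) %>%
  arrange(RMSE)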

# d. Check the residuals of your preferred method. Do they resemble white noise?

nswtrain_snaive <- nswtrain %>%
  model(Seasonal_Naive = SNAIVE(Count)) 

nswtrain_snaive %>%
  gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).

augment(nswtrain_snaive) %>%
  features(.resid, ljung_box, lag = 8, dof = 0)
## # A tibble: 1 × 5
##   Animal State           .model         lb_stat lb_pvalue
##   <fct>  <fct>           <chr>            <dbl>     <dbl>
## 1 Pigs   New South Wales Seasonal_Naive    988.         0

I believe the residuals do not resemble white noise. Using gg_tsresiduals() I can see substantial autocorrelation from lags 1 to 11 and a mean above zero. In addition, the Ljung-Box test returns a p-value of essentially zero, which tells me the residuals are distinguishable from a white noise series.

Question 7

# a. Use an STL decomposition to calculate the trend-cycle and seasonal indices for additive and multiplicative. (Experiment with having fixed (periodic) or changing seasonality.)

aus_clay56to05 <- aus_production %>%
  filter(year(Quarter) < 2005) %>%
  select(-c(Beer, Tobacco, Cement, Electricity, Gas))

aus_clayadd <- aus_clay56to05 %>%
  model(classical_decomposition(Bricks, type = "additive")) %>%
  components() %>%
  autoplot() +
  labs(title = "Classical additive decomposition of Australian clay production")
aus_clayadd
## Warning: Removed 2 row(s) containing missing values (geom_path).

aus_claymulti <- aus_clay56to05 %>%
  model(classical_decomposition(Bricks, type = "multiplicative")) %>%
  components() %>%
  autoplot() +
  labs(title = "Classical multiplicative decomposition of Australian clay production")
aus_claymulti
## Warning: Removed 2 row(s) containing missing values (geom_path).

aus_claystlfix <- aus_clay56to05 %>%
  model(STL(Bricks ~ season(window = 4), robust = TRUE)) %>%
  components() %>% 
  autoplot() +
  labs(title = "STL decomposition: Australian Bricks")
aus_claystlfix

aus_claystlperiod <- aus_clay56to05 %>%
  model(STL(Bricks ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>% 
  autoplot() +
  labs(title = "STL decomposition: Australian Bricks")
aus_claystlperiod

# b) Compute and plot the seasonally adjusted data.

aus_claydcmp <- aus_clay56to05 %>%
  model(stl = STL(Bricks))
aus_cmp <- components(aus_claydcmp)

aus_claysa <- aus_clay56to05 %>%
  autoplot(Bricks, color = 'gray') +
  autolayer(components(aus_claydcmp), season_adjust, color='red') +
  labs(y = "Clay Brick Production", title = "Clay Brick Production overtime")
aus_claysa

# c) Use a naive method to produce forecasts of the seasonally adjusted data.

aus_trend <- aus_cmp %>%
  select(-c(.model, Bricks, trend, season_year, remainder)) 
  
aus_trend %>%
  model(NAIVE(season_adjust)) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_trend) +
  labs(title = "Seasonally adjusted naive forecast", y = "Bricks")

# d) Use decomposition_model() to reseasonalise the results, giving forecasts for the original data.

aus_claydcmp2 <- aus_clay56to05 %>%
  model(stlf = decomposition_model(
    STL(Bricks ~ trend(window = 4), robust = TRUE), NAIVE(season_adjust)
  ))
aus_claydcmp2 %>%
  forecast() %>%
  autoplot(aus_clay56to05) +
  labs(title = "Forecast with decomposition_model()")

# e) Do the residuals look uncorrelated?

gg_tsresiduals(aus_claydcmp2)
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).

augment(aus_claydcmp2) %>%
  features(.resid, ljung_box, lag = 8, dof = 0)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 stlf      27.0  0.000704

The residuals are correlated. This is shown via the gg_tsresiduals() function. The residuals are autocorrelated at lags 1, 5, 7, 8, and 20. This is backed up by the ljung box test where our p-value is 0.000704. This p-value tells me that our residuals are distinguishable from a white noise series.

# f. Compare forecasts from decomposition_model() with those from SNAIVE(), using a test set comprising the last 2 years of data. Which is better?

aus_clayp04to05 <- aus_production %>%
  filter(year(Quarter) > 2003) %>%
  select(-c(Beer, Tobacco, Cement, Electricity, Gas))

aus_clayomit <- na.omit(aus_clayp04to05)

aus_clay04to05_snaive <- aus_clayomit %>%
  model(SNAIVE(Bricks ~ lag("1 year"))) %>%
  forecast(h = "2 years") %>%
  autoplot(aus_clayomit) +
  labs(title = "Forecast with SNAIVE since 2004")
aus_clay04to05_snaive

aus_claydcmp3 <- aus_clayomit %>%
  model(stlf = decomposition_model(
    STL(Bricks ~ trend(window = 4), robust = TRUE), NAIVE(season_adjust)
  ))
aus_claydcmp3 %>%
  forecast() %>%
  autoplot(aus_clayomit) +
  labs(title = "Forecast with decomposition_model() since 2004")

The forecast with decomposition_model() is better because it allows future seasons to differ appreciably from past seasons, whereas the SNAIVE forecast assumes the coming seasons will look just like previous ones, with only the remainder differing. It is also important to point out that decomposition_model() includes a trend component.
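
A more direct comparison holds out the last two years, fits both methods on the remainder, and scores the forecasts numerically; a hedged sketch (clay_train and the 2002 Q4 cutoff are my assumptions, given that the series ends in 2004 Q4):

clay_train <- aus_clay56to05 %>%
  filter(Quarter <= yearquarter("2002 Q4"))   # hold out the last 2 years

clay_train %>%
  model(
    snaive = SNAIVE(Bricks),
    stlf   = decomposition_model(
      STL(Bricks ~ trend(window = 4), robust = TRUE), NAIVE(season_adjust)
    )
  ) %>%
  forecast(h = "2 years") %>%
  accuracy(aus_clay56to05) %>%
  select(.model, RMSE, MAE, MAPE)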

Question 8

# a. Extract data from the Gold Coast region using filter() and aggregate total overnight trips (sum over Purpose) using summarise(). Call this new dataset gc_tourism.

tourism <- as_tibble(tourism)   # drop the tsibble index so group_by()/summarise() behave like plain dplyr
gc_tourism <- tourism %>%
  filter(Region == "Gold Coast") %>%
  group_by(Quarter) %>%
  summarise(Total_Trips = sum(Trips))
gc_tourism
## # A tibble: 80 × 2
##    Quarter Total_Trips
##      <qtr>       <dbl>
##  1 1998 Q1        827.
##  2 1998 Q2        681.
##  3 1998 Q3        839.
##  4 1998 Q4        820.
##  5 1999 Q1        987.
##  6 1999 Q2        751.
##  7 1999 Q3        822.
##  8 1999 Q4        914.
##  9 2000 Q1        871.
## 10 2000 Q2        780.
## # … with 70 more rows
## # ℹ Use `print(n = ...)` to see more rows
# b. Using slice() or filter(), create three training sets for this data excluding the last 1, 2 and 3 years. For example, gc_train_1 <- gc_tourism %>% slice(1:(n()-4)).

gc_tourism_ts <- gc_tourism %>%
  as_tsibble(index = Quarter)

gc_train_1 <- gc_tourism_ts %>%
  filter(year(Quarter) < 2017)

gc_train_2 <- gc_tourism_ts %>%
  filter(year(Quarter) < 2016)

gc_train_3 <- gc_tourism_ts %>%
  filter(year(Quarter) < 2015)

# c. Compute one year of forecasts for each training set using the seasonal naïve (SNAIVE()) method. Call these gc_fc_1, gc_fc_2 and gc_fc_3, respectively.

gc_fc_1 <- gc_train_1 %>%
  model(SNAIVE(Total_Trips ~ lag("year"))) %>%
  forecast(h = "1 year") 
gc_fc_1 %>%
  autoplot(gc_train_1) +
  labs(title = "Train Forecast Without Last Year", y = "Total Trips", x = "Time")

gc_fc_2 <- gc_train_2 %>%
  model(SNAIVE(Total_Trips ~ lag("year"))) %>%
  forecast(h = "1 year")
gc_fc_2 %>%
  autoplot(gc_train_2) +
  labs(title = "Train Forecast Without Last Two Years", y = "Total Trips", x = "Time")

gc_fc_3 <- gc_train_3 %>%
  model(SNAIVE(Total_Trips ~ lag("year"))) %>%
  forecast(h = "1 year")
gc_fc_3 %>%
  autoplot(gc_train_3) +
  labs(title = "Train Forecast Without Last Three Years", y = "Total Trips", x = "Time")

# d. Use accuracy() to compare the test set forecast accuracy using MAPE. Comment on these.
gc_fc_1 %>%
  accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test   75.1  167.  154.  6.36  15.1  2.66  2.36 -0.410
gc_fc_2 %>%
  accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test   12.0  43.1  39.5  1.14  4.32 0.670 0.599 -0.792
gc_fc_3 %>%
  accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
##   .model                   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ l… Test   35.8  91.4  83.9  3.56  9.07  1.46  1.30 0.239

Using accuracy() for gc_fc_1, gc_fc_2, and gc_fc_3, we can see that gc_fc_2 has the lowest MAPE, making it the most accurate of the three forecasts. MAPE, the mean absolute percentage error, measures the average percentage by which the forecasts miss the test data, so lower is better.
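
Collecting the three MAPE values into one table makes the comparison immediate; a small sketch (the training labels are mine):

bind_rows(
  accuracy(gc_fc_1, gc_tourism_ts) %>% mutate(training = "excl. last 1 year"),
  accuracy(gc_fc_2, gc_tourism_ts) %>% mutate(training = "excl. last 2 years"),
  accuracy(gc_fc_3, gc_tourism_ts) %>% mutate(training = "excl. last 3 years")
) %>%
  select(training, MAPE)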

Question 9

# a. From the gafa_stock select apple stock close pricings and plot it
apple <- gafa_stock %>%
  filter(Symbol == "AAPL") %>%
  mutate(trading_day = row_number()) %>%
  update_tsibble(index = trading_day, regular = TRUE) %>%
  select(-c(Open, High, Low, Adj_Close, Volume))
autoplot(apple, Close) +
  labs(title = "Apple stock price over time", x = "Trading days", y = "Close price ($)")

# b. Apply cross-validation with a minimum length of 10, growing by 1 each step (creates the cross-validation training sets)

apple_stch <- apple %>%
  stretch_tsibble(.init = 10, .step = 1) %>%
  relocate(Date, Symbol, .id)

# c. Estimate a Random walk with drift model

apple_rw <- apple_stch %>%
  model(RW(Close ~ drift())) %>%
  forecast(h=1) %>%
  accuracy(apple)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 1259
# d. repeat step c by using a mean model

apple_mean <- apple_stch %>%
  model(MEAN(Close)) %>%
  forecast(h=1) %>%
  accuracy(apple)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 1259
# e. Compare both models’ accuracy. Which model performed the best?

apple_rw
## # A tibble: 1 × 11
##   .model       Symbol .type      ME  RMSE   MAE     MPE  MAPE  MASE RMSSE   ACF1
##   <chr>        <chr>  <chr>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 RW(Close ~ … AAPL   Test  -0.0158  2.10  1.41 -0.0139  1.06  1.00  1.00 0.0330
apple_mean
## # A tibble: 1 × 11
##   .model      Symbol .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>       <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MEAN(Close) AAPL   Test   26.9  37.6  28.5  16.9  18.6  20.2  17.9 0.996

The random walk with drift forecast has a much lower RMSE than the mean forecast, so it performed better. Because RMSE is the square root of the mean squared error, it penalizes large errors heavily; the drift model's far smaller RMSE (about 2.1 versus 37.6) shows that it tracks the closing price much more closely.
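
Stacking the two accuracy tables makes the gap explicit; a one-line sketch:

bind_rows(apple_rw, apple_mean) %>%
  select(.model, RMSE, MAE, MAPE)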

Question 10

# a. From the prices data set, select the wheat series and plot it (do not forget to remove NAs)

wheat1 <- prices %>%
  select(-c(eggs, chicken, copper, nails, oil))

wheat <- na.omit(wheat1)

autoplot(wheat, wheat) +
  labs(title = "Wheat prices over time", y = "Wheat price", x = "Year")

# b. Fit a RW with drift, and forecast for the next 50 periods (plot)

wheat %>%
  model(RW(wheat ~ drift())) %>%
  forecast(h = 50) %>%
  autoplot(wheat) +
  labs(title = "RW with Drift Forecast of Wheat", y = "Wheat Production", x = "Year")

# c. Suppose you do not trust the estimated prediction intervals. Bootstrap 500 future sample paths instead

wheat_bs <- wheat %>%
  model(NAIVE(wheat))
wheat_sim <- wheat_bs %>%
  generate(h = 50, times = 500, bootstrap = TRUE)
wheat_sim
## # A tsibble: 25,000 x 4 [1Y]
## # Key:       .model, .rep [500]
##    .model        year .rep   .sim
##    <chr>        <dbl> <chr> <dbl>
##  1 NAIVE(wheat)  1997 1     124. 
##  2 NAIVE(wheat)  1998 1      98.5
##  3 NAIVE(wheat)  1999 1     107. 
##  4 NAIVE(wheat)  2000 1      95.0
##  5 NAIVE(wheat)  2001 1     161. 
##  6 NAIVE(wheat)  2002 1      56.5
##  7 NAIVE(wheat)  2003 1      68.5
##  8 NAIVE(wheat)  2004 1     152. 
##  9 NAIVE(wheat)  2005 1     150. 
## 10 NAIVE(wheat)  2006 1     327. 
## # … with 24,990 more rows
## # ℹ Use `print(n = ...)` to see more rows
# d. plot all scenarios

wheat %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = wheat)) +
  geom_line(aes(y = .sim, color = as.factor(.rep)), data = wheat_sim) +
  guides(color = "none")

wheat_pi <- wheat_bs %>%
  forecast(h = 50, bootstrap = TRUE, times = 500)

autoplot(wheat_pi, wheat)

  1. Under what conditions might prediction intervals from bootstrapped residuals be reasonable?

Prediction intervals from bootstrapped residuals are reasonable when assuming normally distributed errors is implausible. Bootstrapping only requires that future errors resemble past errors (that the residuals are uncorrelated and identically distributed), so it is especially useful when that assumption holds but normality does not.
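
To see how the bootstrapped intervals differ from the normal-theory ones, the two sets of 95% limits can be extracted with hilo() and compared; a sketch (normal_pi and boot_pi are names introduced here):

normal_pi <- wheat_bs %>% forecast(h = 50) %>% hilo(95)   # Gaussian intervals
boot_pi   <- wheat_pi %>% hilo(95)                        # bootstrapped intervals

# asymmetry in the bootstrapped intervals reflects skewness in the residuals
tibble(
  year      = normal_pi$year[1:5],
  normal_95 = normal_pi$`95%`[1:5],
  boot_95   = boot_pi$`95%`[1:5]
)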