Homework

library(tidyquant)
library(tidyverse)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggfortify)
library(ggplot2)
library(readxl)
library(dplyr)
library(fabletools)
library(fable)
library(slider)
library(zoo)
library(latex2exp)
library(seasonal)
library(lubridate)

#PROBLEM 1

aus_pop <- global_economy %>%
  filter(Country == "Australia") %>%
  select("Population")

aus_pop %>%
  model(RW(Population ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_pop) +
  geom_line(data = slice(aus_pop, range(cumsum(!is.na(Population)))),
            aes(y=Population), linetype = "dashed", colour = "#0072B2") +
  labs(title = "Australian Population DRIFT Forecast")

#Chose DRIFT because data does not exhibit a random walk or seasonality.

aus_bricks <- aus_production %>%
  filter_index("1956 Q1" ~ "2004 Q4") %>%
  select(Bricks)

aus_bricks %>%
  model(SNAIVE(Bricks)) %>%
  forecast(h=5) %>%
  autoplot(aus_bricks) +
  labs(title = "SNAIVE Forecast of Australian Brick Production")

#Tried NIAVE, SNAIVE, and DRIFT models for the brick forecast and all of them gave me a huge prediction interval. #I also tried reducing the forecast to two years into the future, but the PI was still too large. #SNAIVE method was chosen because the data exhibits seasonality

nsw_lambs <- aus_livestock %>%
  filter(State == "New South Wales", Animal == "Lambs")
nsw_lambs %>%
  model(SNAIVE(Count)) %>%
  forecast(h=5) %>%
  autoplot(nsw_lambs) +
  labs(title = "SNAIVE Forecast of Lambs slaughtered in South New Wales")

#Data appears to have an inconsistent seasonal model with a random walk, so DRIFTwas chosen. All models gave a huge prediction interval.

wealth <- hh_budget %>%
  select(Wealth)
wealth %>%
  model(NAIVE(Wealth)) %>%
  forecast(h = "5 years") %>%
  autoplot(wealth) +
  labs(title = "NAIVE Forecast of Household Wealth in Australia, Canada, Japan, and the USA")

#Chose NAIVE because it fits best with economic/fiancial data.

takeaway_turnover <- aus_retail %>%
  filter(Industry == 'Takeaway food services', State == 'Australian Capital Territory') %>%
  select(Turnover)
takeaway_turnover %>%
  model(RW(Turnover ~ drift())) %>%
  forecast(h=5) %>%
  autoplot(takeaway_turnover) +
  geom_line(data = slice(takeaway_turnover, range(cumsum(!is.na(Turnover)))),
          aes(y=Turnover), linetype = "dashed", colour = "#0072B2")

  labs(title = "DRIFT Forecast of Takeaway Food Services Turnover in the Australian Capital Territory")

## $title
## [1] "DRIFT Forecast of Takeaway Food Services Turnover in the Australian Capital Territory"
## 
## attr(,"class")
## [1] "labels"

#Drift gave me the narrowest prediction interval, so it was chosen.

#PROBLEM 2

facebook_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)
autoplot(facebook_stock, Close) +
  labs(title = "Closing Facebook Stock Prices ($USD) from January 2, 2014 to December 31, 2018")

facebook_stock %>%
  model(RW(Close ~ drift())) %>%
  forecast(h=30) %>%
  autoplot(facebook_stock) +
  labs(title = "DRIFT Forecast of Closing Facebook Stock Prices ($USD)")

facebook_stock %>%
  model(RW(Close ~ drift())) %>%
  forecast(h=30) %>%
  autoplot(facebook_stock) +
  geom_segment(aes(x = 1, y = 54.71, xend = 1258, yend = 131.09)) +
  labs(title = "DRIFT Forecast of Closing Facebook Stock Prices ($USD)")

facebook_stock %>%
  model(NAIVE(Close)) %>%
  forecast(h=30) %>%
  autoplot(facebook_stock) +
  labs(title = "NAIVE Forecast of Closing Facebook Stock Prices ($USD)")

#I think that the NAIVE method is best for this data. NAIVE is best used when forecasting Random Walk Data, which we are using because this is stock data, which is almost always Random Walk Data.

#PROBLEM 3

vic_livestock <- aus_livestock %>%
  filter(State == "Victoria")

vic_bbs <- vic_livestock %>%
  filter(Animal == "Bulls, bullocks and steers")
vic_bbs %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_bbs) +
  labs(title = "SNAIVE Forecast of the Slaughter of Bulls, Bullocks, and Steers in Victoria")

vic_sheep <- vic_livestock %>%
  filter(Animal == "Sheep")
vic_sheep %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_sheep) +
  labs(title = "SNAIVE Forecast of the Slaughter of Sheep in Victoria")

vic_lambs <- vic_livestock %>%
  filter(Animal == "Lambs")
vic_lambs %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_lambs) +
  labs(title = "SNAIVE Forecast of the Slaughter of Lambs in Victoria")

vic_pigs <- vic_livestock %>%
  filter(Animal == "Pigs")
vic_pigs %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_pigs) +
  labs(title = "SNAIVE Forecast of the Slaughter of Pigs in Victoria")

vic_ch <- vic_livestock %>%
  filter(Animal == "Cows and heifers")
vic_ch %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_ch) +
  labs(title = "SNAIVE Forecast of the Slaughter of Cows and Heifers in Victoria")

vic_cattle <- vic_livestock %>%
  filter(Animal == "Cattle (excl. calves)")
vic_cattle %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_cattle) +
  labs(title = "SNAIVE Forecast of the Slaughter of Cattle (Excl. Calves) in Victoria")

vic_calves <- vic_livestock %>%
  filter(Animal == "Calves")
vic_calves %>%
  model(SNAIVE(Count)) %>%
  forecast(h = "5 years") %>%
  autoplot(vic_calves) +
  labs(title = "SNAIVE Forecast of the Slaughter of Calves in Victoria")

#If forced to choose, I would probably use the SNAIVE method to forecast these variables. #There is a seasonal component to these variables that I think SNAIVE accounts for decently. #The large prediction intervals tell me that the model could be improved.

#PROBLEM 4

#A: True: Having normally distributed residuals are a good (but not necessarily required) quality of a reliable forecasting model. Residuals without a normal distribution means that your model has bias that is not accounted for.

#B: False: A model with too small residuals may be over-fitted to the training data.

#C: False: There are multiple different ways to measure a forecast's accuracy depending on what is being forecasted.

#D: False: Instead of over-complicating a model, a different model could be used instead.

#E: True: Models with the best test set accuracy will give you the closest forecast estimates to the actual number.

#PROBLEM 5

set.seed(987654321)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_trained <- myseries %>%
  filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
  autolayer(myseries_trained, Turnover, colour = "red")

myseries_fitted <- myseries_trained %>%
  model(SNAIVE(Turnover))

myseries_fitted %>% gg_tsresiduals()

## Warning: Removed 12 row(s) containing missing values (geom_path).

## Warning: Removed 12 rows containing missing values (geom_point).

## Warning: Removed 12 rows containing non-finite values (stat_bin).

myseries_forecast <- myseries_fitted %>%
  forecast(new_data = anti_join(myseries, myseries_trained))

## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")

myseries_forecast %>% autoplot(myseries)

myseries_fitted %>% accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Other r… SNAIV… Trai…  2.75  7.21  5.51  5.52  9.10     1     1 0.656

myseries_forecast %>% accuracy(myseries)

## # A tibble: 1 × 12
##   .model     State Indus…¹ .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr> <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Tu… Vict… Other … Test  -7.92  13.0  9.84 -10.3  12.7  1.78  1.80 0.491
## # … with abbreviated variable name ¹Industry

#PROBLEM 6

nsw_pigs <- aus_livestock %>%
  filter(State == "New South Wales", Animal == "Pigs")

autoplot(nsw_pigs) +
  labs(title = "Slaughter of Pigs in New South Wales")

## Plot variable not specified, automatically selected `.vars = Count`

gg_season(nsw_pigs) +
  labs(title = "Slaughter of Pigs in New South Wales")

## Plot variable not specified, automatically selected `y = Count`

gg_subseries(nsw_pigs) +
  labs(title = "Slaughter of Pigs in New South Wales")

## Plot variable not specified, automatically selected `y = Count`

nsw_trained_pigs <- nsw_pigs %>%
  filter(year(Month) < 2013)

nsw_tested_pigs <- nsw_pigs %>%
  filter(year(Month) >= 2013)

nsw_fitted_pigs <- nsw_trained_pigs %>%
  model(
    Naive = NAIVE(Count),
    `Seasonal Naive` = SNAIVE(Count),
    Drift = RW(Count ~ drift()))


nsw_forcast_pigs <- nsw_fitted_pigs %>%
  forecast(h = 72)
nsw_forcast_pigs %>%
  autoplot(nsw_pigs %>% filter(year(Month) >=2010))

#SNAIVE appears to be the best model for this data. It appears to capture the seasonal component of the data while also not being over-fit.

nsw_trained_pigs %>%
  model(Drift = RW(Count ~ drift())) %>%
  gg_tsresiduals()

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_bin).

nsw_trained_pigs %>%
  model(SNAIVE(Count)) %>%
  gg_tsresiduals()

## Warning: Removed 12 row(s) containing missing values (geom_path).

## Warning: Removed 12 rows containing missing values (geom_point).

## Warning: Removed 12 rows containing non-finite values (stat_bin).

nsw_trained_pigs %>%
  model(NAIVE(Count)) %>%
  gg_tsresiduals()

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_bin).

#This data does not appear to be white noise. All models show that the data has many spikes that go outside the critical value, meaning it isn’t white noise.

#PROBLEM 7

aus_brick_stl <-
  aus_production %>%
  na.omit() %>% 
  model(
    `STL Default` = STL(Bricks),
    `STL 1 Month` = STL(Bricks ~ season(period= '1 month')),
    `STL Periodic` = STL(Bricks ~ season(window = 'periodic')))

aus_brick_stl %>% 
  components() %>% 
  autoplot() +
  theme(legend.title = element_text('STL Model')) +
  labs(title = 'Bricks - STL Decomposition')

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

aus_bricks_naive <-
  aus_production %>% 
  filter_index(. ~ '1996') %>% 
  model(stl = STL(Bricks)) %>% 
  components() %>% 
  select(-.model)
aus_bricks_naive %>% 
  model(NAIVE(season_adjust)) %>% 
  forecast() %>% 
  autoplot(aus_bricks_naive)

aus_brick_trained <-
  aus_production %>% 
  filter_index(. ~ '1996')
aus_brick_trained %>% 
  model(
    d = decomposition_model(
      STL(Bricks ~ season(window = 'periodic')),
      NAIVE(season_adjust)
    )
  ) %>% 
  forecast() %>% 
  autoplot(aus_brick_trained)

brick_tested <-
  aus_production %>% 
  na.omit() %>% 
  filter_index('2002' ~ .)

#PROBLEM 8

gc_tourism <- tourism %>%
  filter(Region == "Gold Coast") %>%
  summarise(Total_Trips = sum(Trips))
gc_tourism

## # A tsibble: 80 x 2 [1Q]
##    Quarter Total_Trips
##      <qtr>       <dbl>
##  1 1998 Q1        827.
##  2 1998 Q2        681.
##  3 1998 Q3        839.
##  4 1998 Q4        820.
##  5 1999 Q1        987.
##  6 1999 Q2        751.
##  7 1999 Q3        822.
##  8 1999 Q4        914.
##  9 2000 Q1        871.
## 10 2000 Q2        780.
## # … with 70 more rows
## # ℹ Use `print(n = ...)` to see more rows

gc_tourism_ts <- gc_tourism %>%
  as_tsibble(index = Quarter)

gc_train_1 <- gc_tourism_ts %>% slice(1:(n()-4))
gc_train_2 <- gc_tourism_ts %>% slice(1:(n()-8))
gc_train_3 <- gc_tourism_ts %>% slice(1:(n()-12))

#Question 8 Part c
gc_fc_1 <- gc_train_1 %>%
  model(SNAIVE(Total_Trips ~ lag("year"))) %>%
  forecast(h = "1 year") 
gc_fc_1 %>%
  autoplot(gc_train_1) +
  labs(title = "Train Forecast -1 Year", y = "Total Trips", x = "Time")

gc_fc_2 <- gc_train_2 %>%
  model(SNAIVE(Total_Trips ~ lag("year"))) %>%
  forecast(h = "1 year")
gc_fc_2 %>%
  autoplot(gc_train_2) +
  labs(title = "Train Forecast -2 Years", y = "Total Trips", x = "Time")

gc_fc_3 <- gc_train_3 %>%
  model(SNAIVE(Total_Trips ~ lag("year"))) %>%
  forecast(h = "1 year")
gc_fc_3 %>%
  autoplot(gc_train_3) +
  labs(title = "Train Forecast -3 Years", y = "Total Trips", x = "Time")

gc_fc_1 %>%
  accuracy(gc_tourism_ts)

## # A tibble: 1 × 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test   75.1  167.  154.  6.36  15.1  2.66  2.36 -0.410

gc_fc_2 %>%
  accuracy(gc_tourism_ts)

## # A tibble: 1 × 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test   12.0  43.1  39.5  1.14  4.32 0.670 0.599 -0.792

gc_fc_3 %>%
  accuracy(gc_tourism_ts)

## # A tibble: 1 × 10
##   .model                   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ l… Test   35.8  91.4  83.9  3.56  9.07  1.46  1.30 0.239

Because gc_fc_2 has the lowest MAPE, it is the most reliable of the three forecasts.

#Problem 9

apple_stock <- gafa_stock %>%
  filter(Symbol == "AAPL") %>%
  mutate(trading_day = row_number()) %>%
  update_tsibble(index = trading_day, regular = TRUE) %>%
  select(-c(Open, High, Low, Adj_Close, Volume))
autoplot(apple_stock) +
  labs(title = "Apple Closing Stock Prices")

## Plot variable not specified, automatically selected `.vars = Close`

apple_crossval <- apple_stock %>%
  stretch_tsibble(.init = 10, .step = 1) %>%
  relocate(Date, Symbol, .id)


apple_walk <- apple_crossval %>%
  model(RW(Close ~ drift())) %>%
  forecast(h=1) %>%
  accuracy(apple_stock)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 1259

apple_mean <- apple_crossval %>%
  model(MEAN(Close)) %>%
  forecast(h=1) %>%
  accuracy(apple_stock)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 1259

#PROBLEM 10

wheat<- prices %>%
  select(-c(eggs, chicken, copper, nails, oil))

wheat_clean <- na.omit(wheat)

autoplot(wheat_clean) +
  labs(title = "Wheat Production", y = "Volume", x = "Year")

## Plot variable not specified, automatically selected `.vars = wheat`

wheat_clean %>%
  model(RW(wheat ~ drift())) %>%
  forecast(h = 50) %>%
  autoplot(wheat_clean) +
  labs(title = "Wheat Forecast RW", y = "Volume", x = "Year")

wheat_naive <- wheat_clean %>%
  model(NAIVE(wheat))
wheat_boot <- wheat_naive %>%
  generate(h = 50, times = 500, bootstrap = TRUE)
wheat_boot

## # A tsibble: 25,000 x 4 [1Y]
## # Key:       .model, .rep [500]
##    .model        year .rep   .sim
##    <chr>        <dbl> <chr> <dbl>
##  1 NAIVE(wheat)  1997 1      150.
##  2 NAIVE(wheat)  1998 1      347.
##  3 NAIVE(wheat)  1999 1      364.
##  4 NAIVE(wheat)  2000 1      346.
##  5 NAIVE(wheat)  2001 1      268.
##  6 NAIVE(wheat)  2002 1      306.
##  7 NAIVE(wheat)  2003 1      327.
##  8 NAIVE(wheat)  2004 1      337.
##  9 NAIVE(wheat)  2005 1      331.
## 10 NAIVE(wheat)  2006 1      318.
## # … with 24,990 more rows
## # ℹ Use `print(n = ...)` to see more rows

wheat_clean %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = wheat)) +
  geom_line(aes(y = .sim, color = as.factor(.rep)), data = wheat_boot) +
  guides(colour = "none")

wheat_bootclean <- wheat_naive %>%
  forecast(h = 50, bootstrap = TRUE, times = 500)

autoplot(wheat_bootclean, wheat_clean)

#The predict intervals from bootstrapped residuals are reasonable when the residuals do not follow a normal distribution.

Homework_3

Peter Cook

2022-09-27