Homework 3

## import necessary packages
library(fpp3)
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.4
## ✔ dplyr       1.1.3     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.3.1
## ✔ lubridate   1.9.3     ✔ fable       0.3.3
## ✔ ggplot2     3.4.4     ✔ fabletools  0.4.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

Question 1

## Question 1
australia <- global_economy %>%
  filter(Country == "Australia")
australia %>%
  model(NAIVE(Population)) %>%
  forecast(h = 10) %>%
  autoplot(australia)

australia %>%
  model(RW(Population ~ drift())) %>%
  forecast(h = 20) %>%
  autoplot(australia) +
  labs(title = "Population Growth in Australia")

The optimal method for the australian data would be drift method. We can see that population growth appears to be linear, which drift is suited for.

brick_fit <- aus_production %>%
  filter(!is.na(Bricks)) %>%
  model(
    Seasonal_naive = SNAIVE(Bricks)
  )
brick_fc <- brick_fit %>%
  forecast(h = "5 years")

brick_fc %>%
  autoplot(aus_production, level = NULL) +
  labs(title = "Brick Production  in Australia", y = "Millions of Bricks") + 
  guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values (`geom_line()`).

The optimal method would likely be the seasonal naive model for the brick production in Australia. The brick production in Australia most likely peaks in the summer, when lots of construction is done, so a seasonal model makes sense.

nsw_lambs <- aus_livestock %>% filter(Animal == "Lambs" & State == "New South Wales")
lambs_fit <- nsw_lambs %>% 
  filter(!is.na(Count)) %>%
  model(seasonal_naive = SNAIVE(Count),
        naive = NAIVE(Count),
        drift = RW(Count ~ drift())
        )
lambs_fc <-lambs_fit %>%
  forecast(h = "10 years")
lambs_fc %>%
  autoplot(nsw_lambs, level = NULL) + 
  labs(title = "Lamb Counts in Australia") + 
  guides(color = guide_legend((title = "Forecast")))

I decided to plot all 3 models togther for this dataset. Based on all of the models plotted together, seasonal naive seems to be the most appropriate, as there is clearly a seasonal element to the data in question.

aus_wealth <- hh_budget %>% 
  filter(Country == "Australia") 
wealth_fit <- aus_wealth %>% 
  filter(!is.na(Wealth)) %>%
  model(seasonal_naive = SNAIVE(Wealth),
        naive = NAIVE(Wealth),
        drive = RW(Wealth ~ drift()))
## Warning: 1 error encountered for seasonal_naive
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
wealth_fc <- wealth_fit %>%
  forecast(h = "5 years")
wealth_fc %>%
  autoplot(aus_wealth, level = NULL) +
  labs(title = "Household Wealth in Australia")
## Warning: Removed 5 rows containing missing values (`geom_line()`).

wealth_fc %>% filter(.model == "drive") %>%
  autoplot(aus_wealth) + labs(title = "Household wealth in Australia")

I believe that the drive model is the most appropriate for this, but I don’t think it is that great of a model in general. Unlike the Australian population model, there are historical periods in which the household wealth plummets, which none of the models are capable of predicting.

food_turnover <- aus_retail %>%
  filter(Industry == "Takeaway food services" & State == "Australian Capital Territory")
food_fit <- food_turnover %>%
  filter(!is.na(Turnover)) %>%
  model(naive = NAIVE(Turnover),
        seasonal_naive = SNAIVE(Turnover),
        drive = RW(Turnover ~ drift()))
food_fc <- food_fit %>%
  forecast(h = "5 years")
food_fc %>%
  autoplot(food_turnover, level = NULL)

I think that the drive model would be the most appropriate of the 3. This is because there is clearly an overall upward trend in the data, which the drive model predicts the best of the 3.

Question 2

A.)

facebook <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(trading_day = row_number()) %>%
  update_tsibble(index = trading_day, regular = TRUE)
facebook %>% autoplot(Close) +
  labs(title = "Facebook Closing Price")

B.)

facebook_fit <- facebook %>% 
  model(drift = RW(Close ~ drift()))
facebook_fc <- facebook_fit %>%
  forecast(h = 100)
facebook_fc %>%
  autoplot(facebook, level = NULL) +
  labs(title ="Facebook Adjusted Closing Price Forecast")

C.)

facebook_slope <- ((facebook$Close[1258] - facebook$Close[1])/(1258 - 1))
print(facebook_slope)
## [1] 0.06076372
facebook_fc %>%
  autoplot(facebook, level = NULL) +
  labs(title = "Facebook Adjusted Closing Price Forecast") +
  geom_abline(slope = facebook_slope, intercept = 54.71, alpha = 0.3)

We can see that when the slope is extended,it falls in the same spot as the drift model.

D.)

facebook_fit <- facebook %>% 
  model(naive = NAIVE(Close),
    drift = RW(Close ~ drift(), 
    seasonal_naive = SNAIVE(Close), 
    mean = MEAN(Close)))
facebook_fc <- facebook_fit %>%
  forecast(h = 200)
facebook_fc %>%
  autoplot(facebook, level = NULL) + 
  labs(title ="Facebook Adjusted Closing Price Forecast")

I don’t think that any of the models are particularly appropriate for predicting the facebook stock price. The naive model seems to be the best though. This is because the ticker price data is in a downward trend, but the drift model is predicting upward movement in the stock. The naive model simply predicts sideways action which seems more likely. There is not a seasonal element to the data, so SNAIVE would not be correct.

Question 3

# Extract data of interest
recent_production <- aus_production %>%
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production %>% model(SNAIVE(Beer))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (`geom_line()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

# Look a some forecasts
fit %>% forecast() %>% autoplot(recent_production)

Based on just the plot of innovation residuals, they appear to be random noise. Although ACF plot does show that there is correlation at the 4Q (1 year) interval, this is only one outlier, so i would still interpret it as white noise.

4

recent_exports <- global_economy %>% 
  filter(Country == "Australia" & Year >= 1992)
fit_exports <- recent_exports %>% model(NAIVE(Exports))
fit_exports %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values (`geom_line()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

fit_exports %>% forecast(h = 10) %>% 
  autoplot(recent_exports)

Based on the gg_tresiduals() function, the residuals look like white noise. The innovation residuals plot is centered around 0 while the lag plot shows that there is not any significant autocorrelation to the data.The residuals are also normally distributed

recent_bricks <- aus_production %>% 
  filter(year(Quarter) >= 1992 & !is.na(Bricks))
fit_bricks <- recent_bricks %>% 
  model(seaonal_naive = SNAIVE(Bricks))
fit_bricks %>% 
  gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (`geom_line()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

fit_bricks %>% 
  forecast(h = "5 years") %>%
  autoplot(recent_bricks) +
  labs(title = "Brick Production Forecast")

From this data it is clear the the residuals are not white noise. The distribution of the residuals is not close to normal, and we can see that they are not centered around 0. It is clear from the innovation residuals that they are not random.

7

A.)

set.seed(12645678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)

B.)

autoplot(myseries, Turnover) + 
  autolayer(myseries_train, Turnover,colour = "red")

Based on this plot, the data is split correctly

C

fit <- myseries_train |>
  model(SNAIVE(Turnover))

D

fit %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).

The residuals most likely have correlation to them. The distribution of the residuals is not quite normal, with a tail on the right side. The distribution of the residuals also doesn’t appear to be a mean of 0, as they appear to mostly be above 1. It also appears that there is some autocorrelation based on the acf plot, where many values fall outside of the interval.

E

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries, level = NULL)

F.)

fit |> accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Clothin… SNAIV… Trai…  2.87  6.26  4.81  4.24  9.16     1     1 0.695
fc |> accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Clothin… Test  -4.06  12.0  8.80 -5.47  8.91  1.83  1.91 0.627

We can see that the accuracy of our model for predicting turnover is not very good. The mean absolute square error (MASE) is about 1.83 which means that a one-step naive forcast would have been a better forecasting method for the data.

G.)

myseries_train_less <- myseries %>%
  filter(year(Month) < 2000)
fit <- myseries_train_less %>%
  model(SNAIVE(Turnover))
fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train_less))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fit %>% accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Clothin… SNAIV… Trai…  1.22  4.50  3.58  2.59  9.35     1     1 0.748
fc %>% accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Clothin… Test   37.7  43.7  38.0  39.9  40.4  10.6  9.72 0.859

To show how sensitive the accuracy to the amount of training data used, I trained the model on 11 fewer years of data. While the MASE of the first model was 1.83, the MASE of this model is 10.61, over 5 times larger. The accuracy measures are extremely sensitive to the amount of training data that is being used.By plotting the model we can also visually see just how much worse the model has gotten.

fc |> autoplot(myseries, level = NULL)