HW 3

library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.3.2
## ✔ lubridate   1.9.3     ✔ fable       0.3.4
## ✔ ggplot2     3.5.1     ✔ fabletools  0.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy): RW(y ~ drift()) method because the growth is increasing in a trend

Bricks (aus_production): attempted use all models to see what happens, although I had a feeling due to the seasonality, sNaive is best and has lowest RMSE

NSW Lambs (aus_livestock): also has seasonality so sNaive is best

Household wealth (hh_budget): there were some peaks and troughs but overal a trend so RW(y ~ drift())

Australian takeaway food turnover (aus_retail): initally I wasnt sure between SNAIVE and Drift, there seemed to be seasonality and trends for some countries, so I checked all models and sNaive is best

aus_pop <- global_economy %>%
  filter(Country == "Australia")

autoplot(aus_pop, Population) +
  labs(title = "Australian Population")

aus_pop_model <- aus_pop %>%
  model(RW(Population ~ drift()))

aus_pop_model %>%
  forecast() %>%
  autoplot(aus_pop) +
  labs(title = "Australian Population Forecast")

aus_bricks <- aus_production[,c(1,4)] %>%
  drop_na()

autoplot(aus_bricks, Bricks) +
  labs(title = "Australian Brick Production")

bricks_fit <- aus_bricks %>%
  model(
    Naive = NAIVE(Bricks),
    sNaive = SNAIVE(Bricks),
    Drift = RW(Bricks ~ drift())
  )

# Generate forecasts for the next 10 periods
bricks_fc <- bricks_fit %>%
  forecast(h = 10)

bricks_fc %>%
  autoplot(aus_production) +
  labs(x = "Year", 
       y = "Millions of bricks", 
       title = "Forecast for quarterly bricks production") +
  guides(color = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

accuracy(bricks_fc, aus_production)
## # A tibble: 3 × 10
##   .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift  Test    NaN   NaN   NaN   NaN   NaN   NaN   NaN    NA
## 2 Naive  Test    NaN   NaN   NaN   NaN   NaN   NaN   NaN    NA
## 3 sNaive Test    NaN   NaN   NaN   NaN   NaN   NaN   NaN    NA
nsw_lambs <- aus_livestock %>%
  filter(State == "New South Wales") %>%
  filter(Animal == "Lambs")

autoplot(nsw_lambs, Count) +
  labs(title = "Lambs in New South Wales")

nsw_lambs_model <- nsw_lambs %>%
  model(SNAIVE(Count))

nsw_lambs_model %>%
  forecast() %>%
  autoplot(nsw_lambs) +
  labs(title = "New South Wales Lambs Forecast")

hh_wealth <- hh_budget[,c(1,2,7)] %>%
  as_tsibble(index = "Year")

autoplot(hh_wealth, Wealth) +
  labs(title = "Household Wealth", y = "Wealth (USD)")

hh_wealth_model <- hh_wealth %>%
  model(RW(Wealth ~ drift()))

hh_wealth_model %>%
  forecast() %>%
  autoplot(hh_wealth) +
  labs(title = "Household Wealth Forecast")

aus_takeaway <- aus_retail %>%
  filter(Industry == "Takeaway food services")

autoplot(aus_takeaway, Turnover) +
  labs(title = "Turnover at Australian Takeaways")

aus_retail_food <- aus_retail %>%
  filter(Industry == "Takeaway food services") %>%
  index_by() %>%
  group_by(Industry) %>%
  summarise(Turnover = sum(Turnover)) %>%
  ungroup()

food_tr <- aus_retail_food %>%
  filter(year(Month) <= 2017)

food_fit <- food_tr %>%
  model(
    Naive = NAIVE(Turnover),
    SNaive = SNAIVE(Turnover),
    Drift = RW(Turnover ~ drift())
  )

food_fc <- food_fit %>%
  forecast(h = 12)

food_fc %>%
  autoplot(food_tr, level = NULL)

accuracy(food_fc, aus_retail_food)
## # A tibble: 3 × 11
##   .model Industry        .type     ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>           <chr>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift  Takeaway food … Test  -175.  195.  176.  -11.6  11.7  3.94  3.43  0.255
## 2 Naive  Takeaway food … Test  -152.  179.  160.  -10.2  10.6  3.58  3.16  0.353
## 3 SNaive Takeaway food … Test    33.4  43.1  37.9   2.10  2.40 0.849 0.760 0.269

5.2 Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series. First reindex the timeseries since it does not trade everyday.

Produce forecasts using the drift method and plot them. Initially unsure whether to use close or open price

Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why? Historically its been increasing, in the more recent data it is decreasing so the model should take that into account therefore Naive is best.

fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)

fb_stock%>%
  autoplot(Open) +
  labs(title= "Daily Open Price of Facebook", y = "USD")

fb_stock %>%
model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 30) %>%
  autoplot(fb_stock) +
    labs(title = "Facebook Close Price Forcast")

fb_stock %>%
  model(RW(Close ~ drift())) %>%
  forecast(h = 63) %>%
  autoplot(fb_stock) +
  labs(title = "Daily Close Price of Facebook", y = "USD") +
  geom_segment(aes(x = 1, y = 54.83, xend = 1258, yend = 134.45),
               colour = "blue", linetype = "dashed")
## Warning in geom_segment(aes(x = 1, y = 54.83, xend = 1258, yend = 134.45), : All aesthetics have length 1, but the data has 1258 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

fb_fit <- fb_stock %>%
  model(
    Naive = NAIVE(Close),
    SNaive = SNAIVE(Close ~ lag(30)),
    Drift = RW(Close ~ drift())
  )

# Generate forecasts for the next 63 periods
fb_fc <- fb_fit %>%
  forecast(h = 63)

# Plot the forecasts using autoplot
autoplot(fb_fc, level = NULL) +
  labs(title = "Daily Close Price of Facebook", y = "USD")

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help. What do you conclude? The residuals are normally distributed, q1 is mostly in the blue line areas however lag 4 is clearly not, and lag 3 and 10 are very close suggesting it is not white noise

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Global_economy: used NAIVE due to trend, the forecast looks good. The residuals are normally distributed; however it is likely white noise.

Bricks: used SNAIVE due to seasonality, the residuals show that they are not white noise and they are not normally distributed.

# Extract data of interest
aus_economy <- global_economy %>%
    filter(Country == "Australia") 
# Define and estimate a model
fit <- aus_economy %>% model(NAIVE(Exports))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit %>% forecast() %>% autoplot(aus_economy)

fit <- aus_production %>%
  filter(!is.na(Bricks)) %>%
  model(SNAIVE(Bricks ~ lag("year")))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit %>% forecast() %>% autoplot(aus_production)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

5.7 For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
  filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red") +
  labs(title = "Turnover Data with Training Set Highlighted",
       y = "Turnover",
       x = "Time")

fit <- myseries_train %>%
  model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)

Compare the accuracy of your forecasts against the actual values. The training data has smaller errors but they are close in values, for exmaple RMSE is lower for the training dataset.

fit |> accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439  1.21 0.915  5.23  12.4     1     1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test  0.836  1.55  1.24  5.94  9.06  1.36  1.28 0.601

How sensitive are the accuracy measures to the amount of training data used? The amount of data used impacts the sensitivity, ideally want to find the optimal amount, like with kfold validation, as addding to much lease to overfit and too little to underfitting.