library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.3     ✔ fable       0.4.1
## ✔ ggplot2     3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ readr   2.1.5
## ✔ purrr   1.0.2     ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

# Australian Population (global_economy)
aus_population <- global_economy %>%
  filter(Country == "Australia")

head(aus_population)
aus_population %>% autoplot(Population) # clear upward trend, so use a random walk with drift model

aus_population_fit <- aus_population %>%
  model(RW(Population ~ drift()))

aus_population_fc <- aus_population_fit %>%
  forecast(h=10)

aus_population_fc %>% autoplot(aus_population) +
  labs(title = "Australian population forecast for 10 years")

#Bricks (aus_production)

bricks <- aus_production %>%
  select(Bricks) %>%
  filter(!is.na(Bricks))

autoplot(bricks, Bricks) # clear quarterly seasonality, so proceed with a seasonal naive model

bricks_fit <- bricks %>% model(SNAIVE(Bricks ~ lag("year"))) # lag("year") sets a 4-quarter seasonal lag

bricks_fc <- bricks_fit %>% forecast(h = 10)

bricks_fc %>% autoplot(bricks) + 
  labs(title = "Australian brick production forecast")

#NSW Lambs (aus_livestock)

unique(aus_livestock$Animal) #Lambs
## [1] Bulls, bullocks and steers Calves                    
## [3] Cattle (excl. calves)      Cows and heifers          
## [5] Lambs                      Pigs                      
## [7] Sheep                     
## 7 Levels: Bulls, bullocks and steers Calves ... Sheep
unique(aus_livestock$State) #New South Wales
## [1] Australian Capital Territory New South Wales             
## [3] Northern Territory           Queensland                  
## [5] South Australia              Tasmania                    
## [7] Victoria                     Western Australia           
## 8 Levels: Australian Capital Territory New South Wales ... Western Australia
nsw_lambs <- aus_livestock %>%
  filter(State == "New South Wales", 
         Animal == "Lambs")

autoplot(nsw_lambs, Count) # the series shows some seasonality, so a seasonal naive model looks like the best fit

nsw_lambs %>%
  model(SNAIVE(Count)) %>%
  forecast(h = 12) %>%
  autoplot(nsw_lambs)

#Household wealth (hh_budget)
#hh_budget
autoplot(hh_budget, Wealth) # positive trend, so use a random walk with drift model

hh_budget %>%
  model(Drift = RW(Wealth ~ drift())) %>%
  forecast(h=5) %>%
  autoplot(hh_budget)

#Australian takeaway food turnover (aus_retail)

unique(aus_retail$Industry)
##  [1] "Cafes, restaurants and catering services"                         
##  [2] "Cafes, restaurants and takeaway food services"                    
##  [3] "Clothing retailing"                                               
##  [4] "Clothing, footwear and personal accessory retailing"              
##  [5] "Department stores"                                                
##  [6] "Electrical and electronic goods retailing"                        
##  [7] "Food retailing"                                                   
##  [8] "Footwear and other personal accessory retailing"                  
##  [9] "Furniture, floor coverings, houseware and textile goods retailing"
## [10] "Hardware, building and garden supplies retailing"                 
## [11] "Household goods retailing"                                        
## [12] "Liquor retailing"                                                 
## [13] "Newspaper and book retailing"                                     
## [14] "Other recreational goods retailing"                               
## [15] "Other retailing"                                                  
## [16] "Other retailing n.e.c."                                           
## [17] "Other specialised food retailing"                                 
## [18] "Pharmaceutical, cosmetic and toiletry goods retailing"            
## [19] "Supermarket and grocery stores"                                   
## [20] "Takeaway food services"
aus_food <- aus_retail %>%
  filter(Industry == "Takeaway food services")

unique(aus_food$State)
## [1] "Australian Capital Territory" "New South Wales"             
## [3] "Northern Territory"           "Queensland"                  
## [5] "South Australia"              "Tasmania"                    
## [7] "Victoria"                     "Western Australia"
autoplot(aus_food, Turnover) # overall upward trend, so proceed with a random walk with drift model (there may be some seasonality here as well; see the quick check below)
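
To check the suspected seasonality before committing to a model, a seasonal plot helps (a sketch; the choice of Victoria is arbitrary):

# overlay each year of turnover for one state; a repeating
# within-year shape indicates seasonality worth modelling
aus_food %>%
  filter(State == "Victoria") %>%
  gg_season(Turnover)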

aus_food_fc <- aus_food %>%
  model(Drift = RW(Turnover ~ drift())) %>%
  forecast(h = 100)

aus_food_fc %>% autoplot(aus_food) +
  facet_wrap(~State)

5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

gafa_stock
unique(gafa_stock$Symbol)
## [1] "AAPL" "AMZN" "FB"   "GOOG"
fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(trading_day = row_number()) %>% # re-index by trading day, since
  update_tsibble(index = trading_day, regular = TRUE) # trading data skips weekends and holidays

autoplot(fb_stock, Close)

fb_stock_fit <- fb_stock %>% model(RW(Close ~ drift()))
fb_stock_fc <- fb_stock_fit %>% forecast(h = 100)

fb_stock_fc %>% autoplot(fb_stock)

#Show that the forecasts are identical to extending the line drawn between the first and last observations
fb_max_min <- fb_stock %>% 
  filter(trading_day == min(trading_day) | trading_day == max(trading_day))

autoplot(fb_stock_fc, fb_stock) +
  geom_line(data = fb_max_min, aes(x = trading_day, y = Close), color = "red")
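
This can be verified numerically as well: the drift method's estimated slope equals the change between the first and last observations divided by the number of intervals, (last - first) / (T - 1). A quick sketch:

# slope of the line joining the first and last closing prices
(last(fb_stock$Close) - first(fb_stock$Close)) / (nrow(fb_stock) - 1)
tidy(fb_stock_fit) # the estimated drift coefficient b should match this slope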

#Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
fb_stock_fit_all <- fb_stock |>
  model(
    Mean = MEAN(Close),
    Naïve = NAIVE(Close),
    Drift = RW(Close ~ drift())
  )
  
fb_stock_fc_all <- fb_stock_fit_all %>% forecast(h=100)

fb_stock_fc_all %>% autoplot(fb_stock, level = NULL) +
   guides(colour = guide_legend(title = "Forecast"))

The mean method sets every forecast equal to the average of all observed values, which can work well when a series has no trend or seasonality, but not in this case. The naïve method simply repeats the last observed value, so it also ignores the clear upward trend. The drift method projects the historical trend forward, which makes it the best of these benchmarks for this series.
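
One way to back this up is to hold out the final stretch of trading days, fit all three benchmarks on the rest, and compare test-set accuracy (a sketch; the 100-day holdout is an arbitrary choice):

# train on all but the last 100 trading days, test on the remainder
fb_train <- fb_stock |> filter(trading_day <= max(trading_day) - 100)
fb_test <- fb_stock |> filter(trading_day > max(trading_day) - 100)

fb_train |>
  model(
    Mean = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift())
  ) |>
  forecast(new_data = fb_test) |>
  accuracy(fb_stock) # lower RMSE/MAE marks the better out-of-sample benchmark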

5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help. What do you conclude?

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

Overall, the residuals do not appear to be randomly scattered. There is a spike at lag 4 in the ACF plot, which suggests autocorrelation in the residuals. So this does not look like white noise, and the seasonal naïve model is not capturing all of the patterns in the data.
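
A formal check is the Ljung-Box test on the innovation residuals (a sketch; lag = 8, twice the seasonal period, follows the textbook's rule of thumb):

augment(fit) |>
  features(.innov, ljung_box, lag = 8) # a small p-value confirms the autocorrelation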

5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

global_economy
aus_exports <- global_economy %>%
  filter(Country == "Australia")

autoplot(aus_exports, Exports) # no seasonality (annual data), so apply the naive model

# Define and estimate a model
fit_aus_exports <- aus_exports |> model(NAIVE(Exports))
# Look at the residuals
fit_aus_exports |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Look at some forecasts
fit_aus_exports |> forecast() |> autoplot(aus_exports)

Overall, the residuals look like white noise: roughly normally distributed, with no significant autocorrelation. Still, I don't think the naïve model is the best fit in this case, since it doesn't take the trend into account.
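
Since the series trends upward, a drift model can be fitted alongside the naive one to see what that trend adds (a sketch; the 10-year horizon is an arbitrary choice):

aus_exports |>
  model(
    Naive = NAIVE(Exports),
    Drift = RW(Exports ~ drift()) # captures the average historical increase
  ) |>
  forecast(h = 10) |>
  autoplot(aus_exports, level = NULL)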

#Bricks series from aus_production
# as in Exercise 5.1, fit a seasonal naive model since there is clear seasonality in this data
aus_production_clean <- aus_production %>%
  select(Bricks) %>%
  filter(!is.na(Bricks))

# Define and estimate a model
bricks_fit <- aus_production_clean |> model(SNAIVE(Bricks))
# Look at the residuals
bricks_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look at some forecasts
bricks_fit|> forecast() |> autoplot(aus_production_clean)

5.7

For your retail time series (from Exercise 7 in Section 2.10):

set.seed(12345)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

autoplot(myseries)
## Plot variable not specified, automatically selected `.vars = Turnover`

Create a training dataset consisting of observations before 2011 using

myseries_train <- myseries |>
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

myseries_fit <- myseries_train |>
  model(SNAIVE(Turnover))

Check the residuals.

myseries_fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

Do the residuals appear to be uncorrelated and normally distributed?

The residuals do not appear to be normally distributed: the histogram is not symmetric and is not centered at zero.
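
Both halves of the question can also be checked formally (a sketch; lag = 24 is twice the monthly seasonal period, and Shapiro-Wilk is just one choice of normality test):

augment(myseries_fit) |>
  features(.innov, ljung_box, lag = 24) # small p-value: residuals are correlated
augment(myseries_fit)$.innov |>
  na.omit() |>
  shapiro.test() # small p-value: residuals are not normally distributed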

Produce forecasts for the test data

fc <- myseries_fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)

Compare the accuracy of your forecasts against the actual values.

myseries_fit |> accuracy()
fc |> accuracy(myseries)

How sensitive are the accuracy measures to the amount of training data used?

Looking at MAE (mean absolute error): the training MAE of 8.93 and the test MAE of 9.24 are close, so the seasonal naïve model loses little accuracy out of sample. The small difference suggests the model is fairly robust, and the accuracy measures are not highly sensitive to the amount of training data used in this case.
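
To probe this directly, the model can be refitted with different training cutoffs and the test accuracy compared (a sketch; the cutoff years are arbitrary, and each cutoff implies a different test window, so the comparison is only rough):

# refit the seasonal naive model for several training cutoffs
snaive_accuracy <- function(cutoff_year) {
  train <- myseries |> filter(year(Month) < cutoff_year)
  test <- myseries |> filter(year(Month) >= cutoff_year)
  train |>
    model(SNAIVE(Turnover)) |>
    forecast(new_data = test) |>
    accuracy(myseries) |>
    mutate(cutoff = cutoff_year, .before = 1)
}
purrr::map_dfr(c(2001, 2006, 2011), snaive_accuracy)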