## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'tsibble'
## 
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
## 
## 
## 
## Attaching package: 'lubridate'
## 
## 
## The following object is masked from 'package:tsibble':
## 
##     interval
## 
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## 
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## 
## ✔ feasts     0.3.0     ✔ fabletools 0.3.2
## ✔ fable      0.3.2     
## 
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()     masks base::date()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ tsibble::intersect()  masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ tsibble::setdiff()    masks base::setdiff()
## ✖ tsibble::union()      masks base::union()

5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)

global_economy %>%
     filter(Country == "Australia") %>%
autoplot(Population) +
     labs(title = "Population of Australia")

#Since the data trend is rising, we can use a drift!


global_economy %>%
     filter(Country == "Australia") %>%
      model(RW(Population ~ drift())) %>%
       forecast(h = "5 years") %>%
        autoplot(global_economy) +
            labs(y = "Number of People", title = "Population of Australia")

Bricks (aus_production)

aus_production %>%
  autoplot(Bricks)

## Warning: Removed 20 rows containing missing values (`geom_line()`).

# Since we can see some seasonal data, we can can use the seasonal Naive model but first we need to fill in NAs

bricks <- aus_production %>% 
  select(Bricks) %>%
  filter(!is.na(Bricks)) 
    
  
bricks %>% 
  model(SNAIVE(Bricks ~ lag("year"))) %>%
    forecast(h = 5 ) %>%
    autoplot(bricks) +
         labs(title="SNAIVE Forecast ", 
         subtitle = "Five Year Forecast", 
        xlab="Years" )

NSW Lambs (aus_livestock)

nswl <- aus_livestock %>%
  filter(State=="New South Wales" & Animal=="Lambs" & !is.na(Count)) %>%
  select(Count)

nswl %>%
  autoplot(Count)

# For this one we could either use seasonal naive or naive, we will use naive to try it out

nswl %>% 
  model(NAIVE(Count)) %>%
  forecast(h=(1)) %>%
    autoplot(nswl) +
         labs(title="NAIVE Forecast ",  
        xlab="Years" )

Household wealth (hh_budget)

hh_budget %>% 
  autoplot(Wealth)

# Since the trend is rising we can use drift again
hh_budget %>%
   model(RW(Wealth ~ drift())) %>%
   forecast(h = "5 years") %>%
        autoplot(hh_budget) +
        labs(y = "Budget", title = "Budget By Company")

Australian takeaway food turnover (aus_retail).

aus_retail %>%
  filter(Industry=="Takeaway food services") %>%
  autoplot(Turnover)

unique(aus_retail$State)

## [1] "Australian Capital Territory" "New South Wales"             
## [3] "Northern Territory"           "Queensland"                  
## [5] "South Australia"              "Tasmania"                    
## [7] "Victoria"                     "Western Australia"

# For this one, I want to train on all three methods but since there are so many states we will choose one
train_fit <- aus_retail %>% 
  filter(State == "Victoria" & Industry=="Takeaway food services") %>%
  model(Mean = MEAN(Turnover),
  'Seasonal Naïve' = SNAIVE(Turnover),
  'Naïve' = NAIVE(Turnover),
  'Drift' = RW(Turnover ~ drift()
               )
                )

train_fit %>% 
  forecast(h = 1) %>%
  autoplot(aus_retail)

5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series.

gafa_stock %>%
  filter(Symbol=="FB") %>%
  autoplot(Close) +
    labs(y="$", title="Facebook Closing prices")

Produce forecasts using the drift method and plot them.

gafa_stock %>%
  filter(Symbol=="FB" & !is.na(Close)) %>%
  mutate(Close = as.numeric(Close)) %>%
  update_tsibble(index = Date, regular = TRUE) %>%
  fill_gaps() %>%
  model(RW(Close ~ drift())) %>%
         forecast(h = 90) %>%
            autoplot(gafa_stock) +
             labs(y = "$", title = "Forecasted Facebook Stock Prices")

Show that the forecasts are identical to extending the line drawn between the first and last observations.

I spent three hours trying to figure out how to draw the line and could not figure it out… I’m either dumb or it is not very obvious when using autoplot.

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

gafa_stock2 <- gafa_stock %>%
  filter(Symbol=="FB" & !is.na(Close)) %>%
  mutate(Close = as.numeric(Close)) %>%
  update_tsibble(index = Date, regular = TRUE) %>%
  fill_gaps() %>%
  filter_index("2017-01-01" ~ .)

train_fits <- gafa_stock2 %>%
  model(
  'Mean' = MEAN(Close),
  'Seasonal Naïve' = SNAIVE(Close),
  'Naïve' = NAIVE(Close),
  'Drift' = RW(Close ~ drift())
       )


train_fits %>% 
  forecast(h = 30) %>%
  autoplot(gafa_stock2)

## Warning: Removed 1 row containing missing values (`()`).

## Warning: Removed 2 rows containing missing values (`geom_line()`).

I’m more inclined to believe that the Naive method is the best since stocks famously are hard to predict and do not also follow seasonality and certainly do not follow their mean

5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values (`geom_line()`).

## Warning: Removed 4 rows containing missing values (`geom_point()`).

## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

We can see that the Residuals follow a normal distribution but not perfect, which is one of the requirements for a good model, and with the acf we can see that there are some outlines in lag data as well as a lack of a cycle.

5.3

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

When we ran the analysis before we used Snaive, so we will use that again

bricks <- aus_production %>% 
  select(Bricks) %>%
  filter(!is.na(Bricks)) 
    
  
fit <- bricks %>% model(SNAIVE(Bricks ~ lag("year"))) 

fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values (`geom_line()`).

## Warning: Removed 4 rows containing missing values (`geom_point()`).

## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

We can see that this one follows an even more normal distribution that the Australian beer production and the acf data shows a consistent cycle.

5.7

For your retail time series (from Exercise 8 in Section 2.10):

set.seed(43)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

#a Create a training dataset consisting of observations before 2011 using
myseries_train <- myseries |>
  filter(year(Month) < 2011)

#b Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

#c Fit a seasonal naïve model using SNAIVE() applied to your training data
fit <- myseries_train |>
  model(SNAIVE(Turnover  ~ lag("year")))

#d Check the residuals.
fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values (`geom_line()`).

## Warning: Removed 12 rows containing missing values (`geom_point()`).

## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).

#looks correlated and the residuals follow a normal distribution

#e Produce forecasts for the test data
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

#f Compare the accuracy of your forecasts against the actual values.
fit |> accuracy()

## # A tibble: 1 × 12
##   State     Indus…¹ .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr>   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Western … Hardwa… "SNAI… Trai…  3.39  11.5  8.77  6.65  13.9     1     1 0.823
## # … with abbreviated variable name ¹Industry

fc |> accuracy(myseries)

## # A tibble: 1 × 12
##   .model     State Indus…¹ .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr> <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(T… West… Hardwa… Test   59.2  64.2  59.2  34.7  34.7  6.76  5.60 0.923
## # … with abbreviated variable name ¹Industry

G How sensitive are the accuracy measures to the amount of training data used?

The accuracy measures are sensitive to the amount of training data used. The model performs poorly in comparison with the test data that did not have as many data points points. However, if there were too many points, we would risk training our model on data that as not relevant to our model forecasts. We should aim to reach a Goldilocks state that I find best represented by the acronym KISS.

Homework Ch.5

2023-02-17

5.1