Homework 3

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.3.3

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.4     ✔ fable       0.4.1
## ✔ ggplot2     3.5.1

## Warning: package 'lubridate' was built under R version 4.3.3

## Warning: package 'tsibble' was built under R version 4.3.3

## Warning: package 'feasts' was built under R version 4.3.3

## Warning: package 'fabletools' was built under R version 4.3.3

## Warning: package 'fable' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

Question 5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)

aus_pop <- global_economy %>%
  filter(Country == "Australia") %>%
  select(Country,Year,Population)

aus_pop %>%
  autoplot(Population) +
  labs(title = "Total Australian Population" ,x="Year", y="Population Total")

There is no seasonality so I will fit the Naive and RW models and produce forecasts for each.

# Fit the models 
pop_models <- aus_pop %>%
  model(
    naive = NAIVE(Population),
    rw = RW(Population ~ drift())
        )

# Produce the forecasts
pop_forecasts <- pop_models %>%
  forecast(h="5 years")

# Plot the forecasts with the original data
pop_forecasts %>%
  autoplot(aus_pop, level = NULL) + 
  labs(title = "Australian Population Forecast")

I would use the RW forecast over the naive because it follows the overall trend better and seems more appropriate.

####Bricks (aus_production)

aus_production %>%
  autoplot(Bricks) +
  labs(title = "Austrialian Brick Production", x="Year Quarter", y="Brick Production")

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

aus_bricks <- aus_production %>%
  select(Quarter, Bricks)

There seems to be a bit of seasonality so I will try all 3 models.

brick_model <- aus_bricks %>%
  filter(!is.na(Bricks)) %>%
  model(
    naive = NAIVE(Bricks),
    seasonal_naive = SNAIVE(Bricks),
    rw = RW(Bricks ~ drift())
  )

bricks_forecasts <- brick_model %>%
  forecast(h = 12)
bricks_forecasts %>%
  autoplot(aus_bricks, level= NULL) + 
  labs(title="Australian Brick Production Forecast", x = "Year Quarter", y="Brick Production")

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

Since there does seem to be some seasonality the seasonal naive model would be the most appropriate in this case.

####NSW Lambs (aus_livestock)

aus_lambs <- aus_livestock %>%
  filter(Animal == "Lambs", State == "New South Wales") %>% 
  select(Month, Count)

aus_lambs %>%
  autoplot(Count) + 
  labs(title = "New South Wales Lamb Slaughters", x= "Year Month", y="Number of Slaughters")

There seems to be some seasonality lets try all 3 models.

lamb_model <- aus_lambs %>%
  filter(!is.na(Count)) %>%
  model(
    naive = NAIVE(Count),
    seasonal_naive = SNAIVE(Count),
    rw = RW(Count ~ drift())
  )

lamb_forecasts <- lamb_model %>%
  forecast(h = 12)
lamb_forecasts %>%
  autoplot(aus_lambs, level= NULL) + 
  labs(title="New South Wales Lamb Slaughter Forecast", x = "Year Month", y="Number of Lambs Slaughtered")

Because of the appearance of seasonality I would go with the seasonal naive model as the most appropriate model.

####Household wealth (hh_budget)

us_hh_wealth <- hh_budget %>%
  filter(Country == "USA") %>%
  select(Year, Wealth)
us_hh_wealth %>%
  autoplot(Wealth) +
  labs(title="USA Household Wealth", x="Year", y= "Household Wealth (Percentage of net disposable income)")

There does not seem to be any seasonal trend as we have yearly data so lets try Naive and RW models

wealth_model <- us_hh_wealth %>%
  filter(!is.na(Wealth)) %>%
  model(
    naive = NAIVE(Wealth),
    rw = RW(Wealth ~ drift())
  )

wealth_forecasts <- wealth_model %>%
  forecast(h = 5)
wealth_forecasts %>%
  autoplot(us_hh_wealth, level= NULL) + 
  labs(title="USA Household Wealth Forecast", x = "Year", y="Household Wealth (Percentage of net disposable income)")

Based on the plot I would choose the naive if I wanted a closer forecast so maybe only a year or two out but if i want to go past 2 years I would choose the RW forecast.

####Australian takeaway food turnover (aus_retail)

total_aus_takeaway_turnover <- aus_retail %>% 
  filter(Industry == "Cafes, restaurants and takeaway food services") %>%
  summarise(Total_Turnover = sum(Turnover, na.rm = TRUE)) %>%
  select(Month, Total_Turnover)
 
total_aus_takeaway_turnover %>%
  autoplot(Total_Turnover) + 
  labs(title="Total Australian Food Takeaway Turnover",x="Year Month", y="Total Turnover")

There is a clear upward trend and also seems to be some seasonality so we will try all 3 models but i think the seasonal or RW will be most approproiate.

turnover_model <- total_aus_takeaway_turnover %>%
  filter(!is.na(Total_Turnover)) %>%
  model(
    naive = NAIVE(Total_Turnover),
    seasonal_naive = SNAIVE(Total_Turnover),
    rw = RW(Total_Turnover ~ drift())
  )

turnover_forecasts <- turnover_model %>%
  forecast(h = 24)
turnover_forecasts %>%
  autoplot(total_aus_takeaway_turnover, level= NULL) + 
  labs(title="Total Australia Food Takeaway Turnover", x = "Year Month", y="Total Turnover") +
  guides(colour = guide_legend(title = "Forecast"))

Because of the seasonality I would say that the naive seasonal is the most appropriate model.

###Question 5.2 Use the Facebook stock price (data set gafa_stock) to do the following: #### Part A Produce a time plot of the series.

fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(trading_day = row_number()) %>%
  update_tsibble(index = trading_day, regular = TRUE)

fb_stock %>%
  autoplot(Close) +
  labs(title = "FB Stock Close Prices", x="Trading Day", y="Stock Close Price (Dollars)")

####Part B Produce forecasts using the drift method and plot them.

fb_forecasts <- fb_stock %>%
  model(rw=RW(Close ~ drift())) %>%
  forecast(h=30)

fb_forecasts %>%
  autoplot(fb_stock, level=NULL) +
  labs(title = "FB Closing Stock Price Forecasts", x="Trading Day", y="Stock Closing Price (Dollars)")

####Part C Show that the forecasts are identical to extending the line drawn between the first and last observations.

first_day <- fb_stock %>% slice(1)
last_day <- fb_stock %>% slice(n())
slope <- (last_day$Close - first_day$Close) / (last_day$trading_day - first_day$trading_day)

# Extend the line beyond the last point same as forecast 30 days
future_days <- seq(last_day$trading_day, last_day$trading_day + 30, by = 1)
future_prices <- last_day$Close + slope * (future_days - last_day$trading_day)

# Create tibble for the line
extended_line <- tibble(
  trading_day = c(first_day$trading_day, future_days),
  Close = c(first_day$Close, future_prices)
)

# Plot the stock data, drift forecast, and manually extended line
fb_forecasts %>%
  autoplot(fb_stock, level = NULL) +
  geom_line(data = extended_line, aes(x = trading_day, y = Close), 
            color = "red", linetype = "dashed", size = 1) +  # Extended line
  labs(title = "FB Closing Stock Price Forecasts with Drift Comparison",
       x = "Trading Day", y = "Stock Closing Price (Dollars)")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can see the drift forecast is identical to the line drawn thru the first and last points.

####Part D Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb_forecasts <- fb_stock %>%
  model(
    mean = MEAN(Close),
    naive = NAIVE(Close),
    rw=RW(Close ~ drift())
    ) %>%
  forecast(h=30)

fb_forecasts %>%
  autoplot(fb_stock, level=NULL) +
  labs(title = "FB Closing Stock Price Forecasts", x="Trading Day", y="Stock Closing Price (Dollars)")

Because there’s no seasonality I will try the mean, naive and rw models. From looking at the plot I think that the RW model is the best one to choose. The mean and naive might work ok but I feel the RW does a better job at capturing the overall trend better.

###Question 5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

From the residual plots above we can conclude that they do not look like white noise. In the first plot we see that they are somewhat centered about zero and we can see a seasonal pattern in the residuals again showing that they are not white noise. Then in the ACF plot we can see a few points outside or on the limits showing that there is some correlation which is not what we want. Finally, from the histogram we can see that they are not normally distributed as there is some skewness in the plot which again is not what we want for them to be white noise.

###Question 5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

####Australian Exports series from global_economy

aus_exports <- global_economy %>%
  filter(Country == "Australia") %>%
  select(Country,Exports,Year)

aus_exports %>%
  autoplot(Exports)

Since we have yearly data with no seasonality the Naive method is more appropriate. So, lets use that model.

# Define and estimate a model
aus_exports_fit <- aus_exports |> model(NAIVE(Exports))
# Look at the residuals
aus_exports_fit |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
aus_exports_fit |> forecast() |> autoplot(aus_exports)

The residuals for the Australian Exports do appear to be white noise. We can see from the residual plot that the mean is centered about zero and although there are some flucuations in the variance it is mostly constant with some fluctuations. Then looking at the ACF plot there is only one observation that is outside the limits so they are not correlated which is what we want. Lastly the histogram does show that the residuals are normally distributed which again is exactly what we want for the residuals to be considered white noise.

####Bricks series from aus_production

aus_bricks <- aus_production %>%
  select(Quarter, Bricks)

aus_bricks %>%
  autoplot(Bricks)

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

Since we have quarterly data that does seem to have some seasonaity to it I will use the SNAIVE method for this data.

# Define and estimate a model
aus_bricks_fit <- aus_bricks |> model(SNAIVE(Bricks))
# Look at the residuals
aus_bricks_fit |> gg_tsresiduals()

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
aus_bricks_fit |> forecast() |> autoplot(aus_bricks)

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

From the residual plots above we can clearly see that the residuals are not white noise. In the first plot the residuals are not centered about 0 and they do not have constant variance. In the ACF plot there are many points that are outside the limits showing that the residuals are correlated. Finally in the histogram we can see that the distribution of the residuals is not normal and it is skewed. So, in conclusion these residuals are not white noise and there is still information left for the model to pull out.

###Question 5.7 For your retail time series (from Exercise 7 in Section 2.10):

####Part A Create a training dataset consisting of observations before 2011 using

set.seed(777)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)
myseries_train

## # A tsibble: 345 x 5 [1M]
## # Key:       State, Industry [1]
##    State           Industry                        `Series ID`    Month Turnover
##    <chr>           <chr>                           <chr>          <mth>    <dbl>
##  1 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Apr     30.1
##  2 South Australia Cafes, restaurants and takeawa… A3349816F   1982 May     30  
##  3 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Jun     28.7
##  4 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Jul     31.4
##  5 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Aug     31.1
##  6 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Sep     31.7
##  7 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Oct     37.8
##  8 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Nov     38.2
##  9 South Australia Cafes, restaurants and takeawa… A3349816F   1982 Dec     47.6
## 10 South Australia Cafes, restaurants and takeawa… A3349816F   1983 Jan     39.5
## # ℹ 335 more rows

####Part B Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

####Part C Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

####Part D Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

The residuals are not normally distributed they are close to being normally distributed but there are some outliers and the distribution is slightly skewed and narrow compared to a normal distribution.

####Part E Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

####Part F Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Cafes, … SNAIV… Trai…  3.99  8.30  6.61  4.58  8.54     1     1 0.740

fc |> accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Cafes, … Test   28.6  38.7  29.9  14.6  15.6  4.52  4.66 0.957

####Part E How sensitive are the accuracy measures to the amount of training data used?

Try adding more training data and taking some away to see the effects on the accuracy.

myseries_train_more <- myseries |>
  filter(year(Month) < 2015)
myseries_train_less <- myseries %>%
  filter(year(Month) < 2008)

fit_more <- myseries_train_more |>
  model(SNAIVE(Turnover))

fit_less <- myseries_train_less |>
  model(SNAIVE(Turnover))

fit_more |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

fit_less |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

fit_more |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Cafes, … SNAIV… Trai…  4.16  8.56  6.87  4.40  8.16     1     1 0.735

fc_more <- fit_more |>
  forecast(new_data = anti_join(myseries, myseries_train_more))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc_more |> autoplot(myseries)

fc_more %>%
  accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Cafes, … Test   27.0  35.8  29.6  12.7  14.3  4.31  4.18 0.826

fit_less %>% accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Cafes, … SNAIV… Trai…  3.51  7.84  6.32  4.45  8.80     1     1 0.793

fc_less <- fit_less |>
  forecast(new_data = anti_join(myseries, myseries_train_less))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc_less |> autoplot(myseries)

fc_less |> accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Cafes, … Test   43.8  51.3  43.8  24.8  24.8  6.93  6.54 0.946

fc %>% accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Cafes, … Test   28.6  38.7  29.9  14.6  15.6  4.52  4.66 0.957

fc_more %>% accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Cafes, … Test   27.0  35.8  29.6  12.7  14.3  4.31  4.18 0.826

fc_less %>% accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Cafes, … Test   43.8  51.3  43.8  24.8  24.8  6.93  6.54 0.946

We can see above from the accuracy measures that the accuracy is pretty sensitive to the amount of training data used. For the first set where we have training data up to 2012 we see a mean error of 28.57292 then when we use more training data up to 2015 we see a decrease in mean error to 27.00417. So 3 years made the error drop slightly. Then when we use less training data up until 2008 we see the mean error jump to 43.81439. That huge jump shows that the the accuracy measures are pretty sensitive to the amount of training data used.

Homework 3

Joey Bochnik

2025-02-18

Question 5.1

Australian Population (global_economy)