DATA 624 Homework 3

Exercise 5.1

## Australian Population // global_economy
global_economy %>%
  filter(Country == "Australia") %>% # filter for Australia
  model(RW = RW(GDP ~ drift())) %>% # use the drift method
  forecast(h = 10) -> pop_fc # generate forecast for 10 years

# plot the forecast
autoplot(pop_fc, global_economy) + 
  labs(title ="10-Year Forecast for Australian Population") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

## Bricks // aus_production
aus_production %>%
  filter(!is.na(Bricks)) %>% # filter out missing data
  model(SNAIVE= SNAIVE(Bricks)) %>% # use the SNAIVE method
  forecast(h = 8) -> bricks_fc # generate forecast for 8 quarters

# plot the forecast
autoplot(bricks_fc, aus_production) + 
  labs(title ="Forecast for Australian Brick Production") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

## NSW Lambs // aus_livestock
aus_livestock %>%
  filter(State == "New South Wales") %>% # filter for NSW
  filter(Animal == "Lambs") %>% # filter for lambs
  model(SNAIVE = SNAIVE(Count)) %>% # use the SNAIVE method
  forecast(h = 24) -> lambs_fc # generate forecast for 24 months

# plot the forecast
autoplot(lambs_fc, aus_livestock) +
  labs(title = "Forecast for NSW Lamb Slaughter") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

## Household Wealth // hh_budget
hh_budget %>%
  model(RW = RW(Wealth ~ drift())) %>% # use the drift method
  forecast(h = 2) -> wealth_fc # forecast for 2 years

# plot the forecast
autoplot(wealth_fc, hh_budget) +
  labs(title = "Forecast for Household Budget") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

## Australian Takeaway Food Turnover // aus_retail
aus_retail %>%
  filter(Industry == "Cafes, restaurants and takeaway food services") %>% # filter for takeaway services
  filter(State == "Australian Capital Territory") %>% # filter for Australian Capital Territory
  model(RW = RW(Turnover ~ drift())) %>% # use the drift method
  forecast(h = 12) -> takeaway_fc # forecast for 12 months

# plot the forecast
autoplot(takeaway_fc, aus_retail) +
  labs(title = "Forecast for Australian Takeaway Food Turnover") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

Exercise 5.2

a. Produce a time plot of the series

# filter for FB stock
fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  # re-index based off trading days --> using example from textbook as guide
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)

# plot the series
autoplot(fb_stock, Close) +
  labs(
    title = "Time Series of Facebook Stock"
  ) +
  xlab("Trading Day") +
  ylab("Close Price ($)") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

b. Produce forecasts using the drift method and plot them

fb_stock %>%
  model(RW = RW(Close ~ drift())) %>% # use the drift method
  forecast(h = 365) -> fbstock_fc # forecast for 365 days

# plot the forecast
autoplot(fbstock_fc, fb_stock) +
  labs(title = "Forecast for Facebook Stock\nUsing Drift Method") +
  xlab("Trading Day") +
  ylab("Close Price ($)") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

c. Show that the forecasts are identical to extending the line drawn between the first and last observations.

# compute first/last and slope
n   <- nrow(fb_stock)
y1  <- dplyr::first(fb_stock$Close)
yT  <- dplyr::last(fb_stock$Close)
m   <- (yT - y1) / (n - 1) # slope

# build the straight line
seg_in <- tibble(
  day   = c(1, n),
  Close = c(y1, yT)
)

# build the manual straight line into the forecast
H <- 365
manual_fc <- tibble(
  day   = n + 1:H,
  Close = yT + m * (1:H)
)

# plot
autoplot(fbstock_fc, fb_stock) +
  geom_line(data = seg_in, aes(day, Close),
            linetype = "dotted", linewidth = 0.7, colour = "red") +
  geom_line(data = manual_fc, aes(day, Close),
            linetype = "dashed", linewidth = 0.8, colour = "black") +
  labs(
    title = "FB Stock Drift Forecast Equals Straight-Line Extension",
    subtitle = "Red dotted = line between first and last observation;\nBlack dashed = its forward extension",
    x = "Trading Day", y = "Close Price ($)"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust=0.5, face = 'italic')
    )

d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb_stock %>%
  model(
    mean = MEAN(Close), # mean
    naive = NAIVE(Close), # naive
    snaive = SNAIVE(Close), # snaive
    RW = RW(Close ~ drift()) # drift
  ) %>%
  forecast(h = 180) -> various_fc # forecast for 180 days

autoplot(various_fc, fb_stock) +
  labs(
    title = "Facebook Stock Forecasts"
  ) +
  xlab("Trading Day") +
  ylab("Closing Price ($)") +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold')
  )

Drift seems like the best option since the overall trend of the stock price is upward.

Exercise 5.3

# Extract data of interest
recent_production <- aus_production %>% 
  filter(year(Quarter) >= 1992)

# Define and estimate a model
model <- recent_production %>%
  model(SNAIVE(Beer)) # SNAIVE method

# Look at the residuals
model %>%
  gg_tsresiduals()

# Look a some forecasts
model %>% 
  forecast() %>%
  autoplot(recent_production) +
  labs(
    title = "Beer Production in Australia",
    subtitle = "Since 1992"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = 'bold'),
    plot.subtitle = element_text(hjust = 0.5, face = 'italic')
  )

From the top plot, the residuals seem to be centered around zero and there is a clear seasonal trend. There is a significant spike at lag-4 on the ACF plot, which indicates autocorrelation. We can confirm that the residuals are not white noise by applying the Ljung-Box test:

augment(model) %>%
  features(.innov, ljung_box, lag=10)

## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    37.8 0.0000412

The p-value is very small, so we can confirm that the residuals are not white noise.

Exercise 5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

# Define and estimate a model
australian_exports <- global_economy %>%
  filter(Country == "Australia") %>% # filter for Australia
  model(naive = NAIVE(Exports)) # apply NAIVE method

# Look at the residuals
australian_exports %>%
  gg_tsresiduals()

# Look a some forecasts
australian_exports %>% 
  forecast(h = 10) %>% # look at next 10 years
  autoplot(global_economy %>% filter(Country == "Australia")) +
    labs(
      title = "Australian Exports"
    ) +
    theme(
      plot.title = element_text(hjust = 0.5, face = 'bold')
    )

From the top plot, the residuals are scattered about 0 with no obvious trend. There is one spike outside the blue dotted line on the ACF plot, but it is not large enough to say that these residuals are not white noise. The residuals are normally distributed according to the histogram.

Using the Ljung-Box test again:

augment(australian_exports) %>%
  features(.innov, ljung_box, lag=10)

## # A tibble: 1 × 4
##   Country   .model lb_stat lb_pvalue
##   <fct>     <chr>    <dbl>     <dbl>
## 1 Australia naive     16.4    0.0896

The p-value is 0.08963678, which is large. We can confirm that the residuals are white noise.

# Define and estimate a model
bricks <- aus_production %>%
  filter(!is.na(Bricks)) %>% # filter out missing data
  model(snaive = SNAIVE(Bricks)) # apply SNAIVE method; SNAIVE is used because there is strong seasonality in this data

# Look at the residuals
bricks %>%
  gg_tsresiduals()

# Look a some forecasts
bricks %>% 
  forecast(h = 8) %>% # look at next 8 quarters
  autoplot(aus_production) +
    labs(
      title = "Australian Brick Production"
    ) +
    theme(
      plot.title = element_text(hjust = 0.5, face = 'bold')
    )

According to the top plot, there is no obvious trend with the residuals. From the ACF plot, we can see that there are many spikes outside the blue dotted line. This indicates that there is autocorrelation, so the data are not white noise. We can confirm this with the Ljung-Box test.

augment(bricks) %>%
  features(.innov, ljung_box, lag=10)

## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 snaive    301.         0

The p-value is low, so we can confirm that the residuals are not white noise.

Exercise 5.7

library(readxl)
retail <- readxl::read_excel("/Users/kristinlussi/Documents/MSDS/DATA 624/retail.xlsx", skip = 1)

myseries <- retail %>%
  select(Month = `Series ID`, Turnover = A3349398A) %>%
  mutate(Month = yearmonth(Month)) %>%   # convert to yearmonth
  as_tsibble(index = Month)

# part a - from text
myseries_train <- myseries |>
  filter(year(Month) < 2011)

# part b - from text
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

# part c - from text
fit <- myseries_train %>%
  model(SNAIVE())

# part d - from text
fit %>%
  gg_tsresiduals()

Do the residuals appear to be uncorrelated and normally distributed?:

According to the ACF plot, there are significant spikes between lag-0 and lag-9. This indicates that there is autocorrelation. According to the top plot, the residuals are not centered around zero and the variation is not consistent overtime. The histogram is right-skewed, so the distribution is not normal. In conclusion, the residuals are not uncorrelated and not normally distributed.

# part e - from text
fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))
fc %>%
  autoplot(myseries)

# part f - from text
fit %>%
  accuracy()

## # A tibble: 1 × 10
##   .model   .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE() Training  73.9  88.3  75.1  6.07  6.13     1     1 0.631

fc %>%
  accuracy(myseries)

## # A tibble: 1 × 10
##   .model   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE() Test   160.  180.  160.  6.03  6.03  2.13  2.04 0.569

The RMSE more than doubles from training to testing, which shows that the model is not very accurate. The MASE being greater than 1 indicates that this model performs worse than a naive model.

Part g. How sensitive are the accuracy measures to the amount of training data used?:

The RMSE, MAE, and MASE nearly doubled from the training to testing set, which means that these measures are very sensitive to the amount of training data used. The model’s performance declines when it is applied to unseen data.