#Import needed libraries

library(tsibbledata) #to use the time series data in it for the exercises.
library(tsibble) # to use datasets and function as_tsibble
library(tibble) # to use view function
library(ggplot2)
library(feasts) # to use the functions for graphics like autoplot()

library(readr) # to uses read_csv function
library(dplyr) # to use Filter, mutate, arrange function etc
library(tidyr) # to use pivot_longer function

library(USgas) # to use us_total data

library(fpp3)  # to use us_gasoline dataset 
library(seasonal) # X-13ARIMA-SEATS decomposition
library(feasts)

Exercises:

5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

##Australian Population (global_economy)

Answers:

The time series shows a clear upward trend with no seasonality. i would look at drift forecasting method in this case.The drift method is to allow the forecasts to increase or decrease over time, where the amount of change over time (called the drift) is set to be the average change seen in the historical data.

#visualize the data
global_economy |>
  filter(Country == "Australia")  |>
autoplot(Population) +
  labs(title= "Australia Population", y = "Total Population") +
  scale_x_continuous(breaks = seq(min(global_economy$Year), max(global_economy$Year), by = 5))+
  scale_y_continuous(labels = scales::comma)

#Filter to Australia and create a ts tibble
aus_population <- global_economy |>
  filter(Country == "Australia")  |>
  select(Population)


#set training data
aus_population_train <- aus_population |>
    filter_index("1960" ~ "2010") 


#fit the training model
aus_population_fit  <- aus_population_train |>
  model(
#    Mean = MEAN(Population),
#    `Naïve` = NAIVE(Population),
#    `Seasonal naïve` = SNAIVE(Population ~ lag("Year")),
    `Drift` = RW(Population ~ drift())
  )

# Generate forecasts for 10 years
aus_population_fc <- aus_population_fit  |> forecast(h = 10)

# Plot forecasts against actual values
aus_population_fc |>
  autoplot(aus_population_train, level = NULL) +
  autolayer(
    filter_index(aus_population, "2011" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Total Population",
    title = "Drift Forecast for Annual Australia population"
  ) +
  scale_x_continuous(breaks = seq(min(global_economy$Year), max(global_economy$Year), by = 5))+
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(title = "Forecast"))

Brick production from aus_production.

Answers:

The time series shows a upward trend with seasonality behavior. There are seasonal fluctuations that grow with the level of the series. The Seasonal Naive forecasting method may be appropriate in this case.

# removing the missing values from the dataset
missing_values <- sum(is.na(aus_production))
print(missing_values)
## [1] 44
# Load the dataset
bricks <- aus_production |>
  drop_na(Bricks) |>
 select(Bricks)

# Plot the data
bricks |>
  drop_na(Bricks) |>
  autoplot(Bricks) +
  labs(title = "Quarterly production of Bricks in Australia", y = "Brick production in millions of bricks")

# Set training data
bricks_train <- bricks |>
    drop_na(Bricks) |>
  filter_index("1970 Q1" ~ "2005 Q1") 

# Fit the training model
bricks_fit  <- bricks_train |>
  model(
    `Seasonal naïve` = SNAIVE(Bricks)
  
  )

# Generate forecasts for 12 quarters
bricks_fc <- bricks_fit  |> forecast(h = 12)

# Plot forecasts against actual values
bricks_fc |>
  autoplot(bricks_train, level = NULL) +
  autolayer(
    filter_index(bricks, "2005 Q1" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Brick production in millions of bricks",
    title = "Seasonal Naive Forecast for Quarterly production of Bricks in Australia"
  ) +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(title = "Forecast"))

NSW Lambs (aus_livestock)

Answer:

The time series shows a changing trend with upward and downward trend. There is strong seasonality behavior. There are seasonal fluctuations that change with the level of the series. The Seasonal Naive forecasting method may be appropriate in this case.

# Load the dataset
nsw_lambs <- aus_livestock |>
filter(State == "New South Wales" & Animal == "Lambs")


# Plot the data
nsw_lambs |>

autoplot(Count) +
  labs(title= "NSW Lambs production in Australia for human consumption", y = "Number of NSW Lambs slaughtered")+
  scale_y_continuous(labels = scales::comma)

# Set training data
nsw_lambs_train <- nsw_lambs |>
  filter_index("1980 Jan" ~ "2018 Dec") 

# Fit the training model
nsw_lambs_fit  <- nsw_lambs_train |>
  model(
    `Naïve` = SNAIVE(Count)
  
  )

# Generate forecasts for 12 months
nsw_lambs_fc <- nsw_lambs_fit  |> forecast(h = 12)

# Plot forecasts against actual values
nsw_lambs_fc |>
  autoplot(nsw_lambs_train, level = NULL) +
  autolayer(
    filter_index(nsw_lambs, "2018 Jan" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Brick production in millions of bricks",
    title = "Forecast for Quarterly production of Bricks in Australia"
  ) +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(title = "Forecast"))

Household wealth (hh_budget)

Answer

The time series has a upward trend with few fluctuations. However there does not appear to be any seasonality or cycle behavior.I would look at drift forecasting method in this case.The drift method is to allow the forecasts to increase or decrease over time, where the amount of change over time (called the drift) is set to be the average change seen in the historical data.

# Plot the data
hh_budget |>
autoplot(Wealth) +
  labs(title= "Household budgets for Australia, Japan, Canada and USA from 1995-2016", y = "Wealth as a percentage of net disposable income")+
  scale_y_continuous(labels = scales::comma)

# Set training data
hh_budget_train <- hh_budget  

# Fit the training model
hh_budget_fit  <- hh_budget_train |>
  model(
    `Drift` = RW(Wealth ~ drift())
  
  )

# Generate forecasts for 4 years
hh_budget_fc <- hh_budget_fit  |> forecast(h = 4)

# Plot forecasts against actual values
hh_budget_fc |>
  autoplot(hh_budget_train, level = NULL) +
  autolayer(
    filter_index(hh_budget, "2016" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Wealth as a percentage of net disposable income",
    title = "Drift forecast for Household budgets for Australia, Japan, Canada and USA"
  ) +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(title = "Forecast"))

Australian takeaway food turnover (aus_retail).

Answer

The time series shows a general upward trend. There is strong seasonality behavior. There are seasonal fluctuations that change with the level of the series. The Seasonal Naive forecasting method may be appropriate in this case.

# Plot the data
myseries <- aus_retail |>
  filter(Industry == "Cafes, restaurants and takeaway food services") 


myseries %>%
  ggplot(aes(x = Month, y = Turnover)) +
  geom_line() +
  labs(y = "Takeaway food turnover in $Million AUD", title = "Australian takeaway food turnover") +
  scale_y_continuous(labels = scales::comma) + 
  facet_wrap(~State, scales = "free_y")

# Set training data
turnover_train <- myseries

# Fit the training model
turnover_fit  <- turnover_train |>
  model(
    `Seasonal naïve` = SNAIVE(Turnover)
  
  )

# Generate forecasts for 36 MONTHS
turnover_fc <- turnover_fit  |> forecast(h = 36)

# Plot forecasts against actual values
turnover_fc |>
  autoplot(turnover_train, level = NULL) +
  
  labs(
    y = "Takeaway food turnover in $Million AUD", title = "Seasonal naïve forecast(blue) of Australian takeaway food turnover") +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(title = "Forecast")) +
   facet_wrap(~State, scales = "free")

5.2 Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series.

Produce forecasts using the drift method and plot them.

Show that the forecasts are identical to extending the line drawn between the first and last observations.

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

Answer:

After restricting GAFA Stock table to facebook , i used the closing price (Close) as the target . The time plot shows a general upward trend with no seasonality or cyclic behavior. I produced the forecast using the drift method. I showed that the drift forecast is identical to extending the line drawn between the first and last observations. I used other benchmark functions to forecast the same dataset. Of the 4 methods, Drift forecasting method is best representative of the time plot. As the time series is not seasonal SNAIVE is not applicable. As the stock price cannot remain constant and will fluctate , other methods do not seem appropriate.

#Load the dataset
 facebook <- gafa_stock |>
  filter( Symbol == "FB")

#Visualize the data
autoplot(facebook, Close)+
  labs( y = "The closing price for the stock in $USD", title = "Historical stock prices from 2014-2018 for Facebook")+
      scale_y_continuous(labels = scales::comma)

# Set training data  and as it is a irregular time series as stock is traded on trading days only, i am filling the gaps with NA
facebook_train <- facebook |>
   update_tsibble(regular = TRUE) |>
  fill_gaps()

# Fit the drift model
facebook_fit <- facebook_train |>
  model(
     `Drift` = RW(Close ~ drift())
    
  )
# Generate forecasts for a year
facebook_fc <- facebook_fit |> forecast(h = 364)

# Plot forecasts against actual values
facebook_fc |>
  autoplot(facebook_train, level = NULL) +
  autolayer(facebook,
    colour = "black"
  ) +
  labs(y = "The closing price for the stock in $USD",
    title = "Drift Forecast(blue) for Facebook closing stock price"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Open`

#Show that the forecasts are identical to extending the line drawn between the first and last observations
first_date = min(facebook$Date)
last_date = max(facebook$Date)
first_price = facebook$Close[which.min(facebook$Date)]
last_price = facebook$Close[which.max(facebook$Date)] 
  
 facebook_fc |>
 autoplot(facebook_train, level = NULL)  +
  labs(title = "Drift forecast (blue) of Facebook closing stock price",
       subtitle = "The green dotted line indicating that the forecasts are identical to extending the line drawn between the first and last observations " ,y = "Stock Price ($)") +
  geom_segment(aes(x = first_date, y = first_price, xend = last_date, yend = last_price),
               colour = "green", linetype = "dashed")

# Try using some of the other benchmark functions to forecast the same data set
# Fit the drift model
facebook_fit_all <- facebook_train |>
  model(
    Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
    `Seasonal naïve` = SNAIVE(Close),
   `Drift` = RW(Close ~ drift())
    
  )
# Generate forecasts for a year
facebook_fc <- facebook_fit_all |> forecast(h = 364)

# Plot forecasts against actual values
facebook_fc |>
  autoplot(facebook_train, level = NULL) +
  autolayer(facebook,
    colour = "black"
  ) +
  labs(y = "The closing price for the stock in $USD",
    title = "Drift Forecast(blue) for Facebook closing stock price"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Open`

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.What do you conclude?

Answer:

The residual results are not white noise, as the residuals seem to be centered around zero and follow a constant variance except for 2 peaks which are the lowest. The histogram of the residuals displays a bi-modal distribution, but more towards normal distribution as its not skewed. The ACF plot shows strong correlation especially for a log of 4 quarters. This is expected as it is a seasonal quarterly time series data. The p-value of 0 < 0.05 significance level, indicates a significant autocorrelation in the residuals. This means the residuals are not white noise. Based on this , we can conclude that there are systematic patterns in the data that the SNAIVE model isn’t capturing. The forecasts from this model may not be as reliable as desired.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)

# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))

# Look at the residuals
fit |> gg_tsresiduals()

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

# test box_pierce
 augment( fit ) |> features(.innov, box_pierce, lag = 10)
## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    34.4  0.000160
 #test ljung_box
 augment( fit ) |> features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    37.8 0.0000412

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports series from global_economy

Answer

The time plot indicates a upward trend. However the seasonality is not evident. Hence its appropriate to use the NAIVE forecasting method. The histogram of the residuals displays a normal distribution and is not skewed. The ACF plot shows strong correlation especially for a lag of 1 year. the p values of Box-Pierce test and Ljung-Box test > 0.05 significance level, suggests no significant autocorrelation in residuals.This suggest the residuals ARE behaving like white noise. Based on this , we can conclude that the model is doing a reasonable job, the residuals appear to be independently distributed and the basic assumptions of the forecasting model are being met.

# Extract data of interest
recent_aus_economy <- global_economy |>
  filter(Country == "Australia") |>
  select( Exports) 


# Define and estimate a model
fit <- recent_aus_economy |> model(NAIVE(Exports))

# Look at the residuals
fit |> gg_tsresiduals()

# Look a some forecasts
fit |> forecast() |> autoplot(recent_aus_economy) +
  labs( title = "Forecast of Australia's export of goods and services ",  y = "Exports of goods and services (% of GDP)")

# test box_pierce
 augment( fit ) |> features(.innov, box_pierce, lag = 10)
## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    14.6     0.148
 #test ljung_box
 augment( fit ) |> features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    16.4    0.0896

Bricks series from aus_production

Answer

The time series shows a upward trend with seasonality behavior. There are seasonal fluctuations that grow with the level of the series. The Seasonal Naive forecasting method may be appropriate in this case.

The residual results are not white noise, as the residuals seem to be centered around zero and follow a constant variance except for 1 peak which is the lowest. The histogram of the residuals is not a normal distribution, as its left skewed. The ACF plot shows strong correlation especially a cyclic behavior which repeats 10 years. The p-value of 0 < 0.05 significance level, indicates a significant autocorrelation in the residuals. This means the residuals are not white noise. Based on this we can conclude that seasonal naïve (SNAIVE) model is not capturing all the patterns in the data. We need to look into other models which can capture this information.

# Load the dataset
bricks <- aus_production |>
  drop_na(Bricks) |>
 select(Bricks)

# Plot the data
bricks |>
  autoplot(Bricks) +
  labs(title = "Quarterly production of Bricks in Australia", y = "Brick production in millions of bricks")

# Fit the training model
bricks_fit  <- bricks |>
  model(
    `Seasonal naïve` = SNAIVE(Bricks)
  
  )

# Look at the residuals
bricks_fit  |> gg_tsresiduals()

# Generate forecasts 
 bricks_fit  |> 
   forecast()|>  
   autoplot(bricks) +
  labs(
    y = "Brick production in millions of bricks",
    title = "Seasonal Naive Forecast for Quarterly production of Bricks in Australia"
  ) +
  scale_y_continuous(labels = scales::comma) +
  guides(colour = guide_legend(title = "Forecast"))

 # test box_pierce
 augment( bricks_fit ) |> features(.innov, box_pierce, lag = 10)
## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 Seasonal naïve    292.         0
 #test ljung_box
 augment( bricks_fit ) |> features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 Seasonal naïve    301.         0

5.7 For your retail time series Create a training dataset consisting of observations before 2011

How sensitive are the accuracy measures to the amount of training data used?

Answer:

The ACF plot shows that there is some autocorrelation in the data. The residuals are not centered around 0 and seems to be right skewed.Hence its not normal distribution. They also do not have constant variation. The residuals are not uncorrelated.

The forecasted data does not account for the increase in the actual data. However, the actual data appears to fall mostly within the 80% confidence interval.

The errors are smaller on the training data versus the test data.

Accuracy measures are sensitive to the amount of training data used. With an increased amount of training data, there is better accuracy. However, too much training data will have negative effects. To solve this problem, it may be best to do a cross validation with various training sets and find the smallest RMSE computed.

set.seed(1234)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

# Create a training dataset consisting of observations before 2011 
myseries_train <- myseries |>
  filter(year(Month) < 2011)

# Check that your data have been split appropriately
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

# Fit a seasonal naïve model using SNAIVE()
fit <- myseries_train |>
  model(SNAIVE(Turnover))

# Check the residuals.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Produce forecasts for the test data
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)

# Compare the accuracy of your forecasts against the actual values.
fit |> accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Tasmania Cafes, … SNAIV… Trai…  1.33  2.90  2.22  6.31  10.7     1     1 0.800
fc |> accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Tasm… Cafes, … Test   7.12  9.13  7.58  13.2  14.4  3.42  3.15 0.863