library(tidyverse)
library(fpp3)
library(seasonal)
library(fable)
Time Series plot of Australian population
aus_pop <- global_economy %>% filter(Country == "Australia")
aus_pop %>% autoplot(Population) + labs(title = "Population of Australia")
Since there is a clear upward trend in the population of Australia, the forecasting method that I think is most appropriate is the drift method. This method is set to be the average change seen in the historical data.
# fit the model
aus_fit <- aus_pop %>% model(RW(Population ~ drift()))
# Generate forecasts for the next three years
aus_fc <- aus_fit |> forecast(h = 5)
# Plot forecast
aus_fc |>
autoplot(aus_pop, level = NULL, color = "red") +
labs(
y = "Population",
title = "Five Year Forecast for Australian Population"
)
Time Series plot of clay brick production in Australia
bricks <- aus_production %>% filter_index("1956 Q1" ~ "2009 Q4") %>%
select(Quarter, Bricks)
bricks %>% autoplot(Bricks) + labs(title = "Clay Brick Production in Australia")
Visualizing the bricks data, there are clear seasonal trends. The method most appropriate woudl be the seasonal naive model.
# fit the model
bricks_fit <- bricks %>% model(SNAIVE(Bricks ~ lag("year")))
# Generate forecasts for the next five years
bricks_fc <- bricks_fit %>% forecast(h = "5 years")
# Plot forecast
bricks_fc %>%
autoplot(bricks, color = "red", level = NULL) +
labs(
y = "Population",
title = "Five Year Forecast for Australian Clay Brick Production"
)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_line()`).
Time Series plot of NSW lambs in Australia
head(aus_livestock)
## # A tsibble: 6 x 4 [1M]
## # Key: Animal, State [1]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
nsw_lambs <- aus_livestock %>% filter(Animal == "Lambs", State == "New South Wales")
nsw_lambs %>% autoplot(Count) + labs(title = "Number of Lambs Slaughtered in New South Wales")
There appears ot be seasonal trends in the number of lambs slaughtered in New South Wales so the seasonal naive model is approriate
# fit the model
nsw_lambs_fit <- nsw_lambs %>% model(SNAIVE(Count ~ lag("year")))
# Generate forecasts for the next two years
nsw_lambs_fc <- nsw_lambs_fit %>% forecast(h = 24)
# Plot forecast
nsw_lambs_fc %>%
autoplot(nsw_lambs, color = "red", level = NULL) +
labs(
y = "Population",
title = "Two Year Forecast for the Count of Lambs Slaughtered in New South Wales"
)
usa_hh_wealth <- hh_budget %>% filter(Country == "USA") %>% select(Wealth)
usa_hh_wealth %>% autoplot(Wealth) + labs(title = "Wealth Trend in the USA")
Check for autocorrelation of lags to determine if this is close to a random walk. If low autocorreltion and random walk is likely.
usa_hh_wealth %>% gg_lag(period = "year", lags = 1:5, geom = "point")
## Plot variable not specified, automatically selected `y = Wealth`
usa_hh_wealth %>% ACF(Wealth, lag_max = 25) %>% autoplot()
The low auto-corraltions potentially suggest a random walk but to allow more flexibility for the predicted values to increase or decrease over time, I choose to forecast the wealth indicator using a drift model vs. a naive model.
# fit the model
usa_hh_wealth_fit <- usa_hh_wealth %>% model(RW(Wealth ~ drift()))
# Generate forecasts for the next five years
usa_hh_wealth_fc <- usa_hh_wealth_fit %>% forecast(h = 5)
# Plot forecast
usa_hh_wealth_fc %>%
autoplot(usa_hh_wealth, color = "red", level = NULL) +
labs(
y = "Population",
title = "Five Year Forecast of the Wealth Indicator in the USA: Drift Model"
)
aus_food_to <- aus_retail %>% filter(Industry == "Takeaway food services", State == "Australian Capital Territory")
aus_food_to %>% autoplot(Turnover) + labs(title = "Takeaway Food Services in the Autralian Capital Territory", y = "AUD (Millions)")
aus_food_to %>% gg_lag(Turnover, period = "month", lags = 1:12, geom = "point")
aus_food_to %>% ACF(Turnover, lag_max = 36) %>% autoplot()
aus_food_to %>% gg_season(Turnover)
As there appears to be a clear upward trend the best model would be to use the drift model.
# fit the model
aus_food_to_fit <- aus_food_to %>% model(RW(Turnover ~ drift()))
# Generate forecasts for the next 3 years
aus_food_to_fc <- aus_food_to_fit %>% forecast(h = 36)
# Plot forecast
aus_food_to_fc %>%
autoplot(aus_food_to, color = "red", level = NULL) +
labs(
y = "AUD (MIllions)",
title = "Three Year Forecast of Food Take Away Services in the Australian Capital Territory \n Drift Model"
)
fb <- gafa_stock %>% filter(Symbol == "FB") %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE)
head(fb)
## # A tsibble: 6 x 9 [1]
## # Key: Symbol [1]
## Symbol Date Open High Low Close Adj_Close Volume day
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 FB 2014-01-02 54.8 55.2 54.2 54.7 54.7 43195500 1
## 2 FB 2014-01-03 55.0 55.7 54.5 54.6 54.6 38246200 2
## 3 FB 2014-01-06 54.4 57.3 54.0 57.2 57.2 68852600 3
## 4 FB 2014-01-07 57.7 58.5 57.2 57.9 57.9 77207400 4
## 5 FB 2014-01-08 57.6 58.4 57.2 58.2 58.2 56682400 5
## 6 FB 2014-01-09 58.7 59.0 56.7 57.2 57.2 92253300 6
fb %>% autoplot(Close) + labs(title = "Facebook (Meta Inc.) Closing Stock Prices", y = "USD")
# fit the model
fb_fit <- fb %>% model(RW(Close ~ drift()))
# Generate forecasts for the next 1 year
fb_fc <- fb_fit %>% forecast(h = 200)
# Plot forecast
fb_fc %>%
autoplot(fb, color = "red", level = NULL) +
labs(
y = "USD",
title = "Facebook Closing Price Forecast: Drift Model"
)
Creating a line through the first and last value of the time series (blue dashed line) plot shows that the forecast line is identical to this. The line demonstrates the average change in seen in the historical data, which is how this method forecasts future values.
# Plot forecast
fb_fc %>%
autoplot(fb, color = "red", linewidth = 1, level = NULL) +
labs(
y = "USD",
title = "Facebook Closing Price Forecast: Drift Model"
) +
#geom_abline(slope = 0.0608, intercept = 54.71, color = "blue", linetype = 2) +
geom_segment(aes(x = 1, y = 54.71, xend = 1258, yend = 131.09), color = "blue", linewidth = 1, linetype = 2)
The video lecture in 5.2 mentioned the Efficient Market Hypothesis and that if you are dealing with financial data and the market is efficient the Naive Model may be the best benchmark model to start with. The Naive model takes the last observed data as the forecast.
# fit the model
fb_fit <- fb %>% model(NAIVE(Close))
# Generate forecasts for the next 1 year
fb_fc <- fb_fit %>% forecast(h = 200)
# Plot forecast
fb_fc %>%
autoplot(fb, color = "red", level = NULL) +
labs(
y = "USD",
title = "Facebook Closing Price Forecast: Naive Model"
)
Comparing this to the drift model, I think that using the Naive model does work better for forecasting stock prices. The Naive method follows the assumption that the value tomorrow is equal to its value today. When dealing with stock prices this is the most simple reliable way of predicting future stock prices.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)
The importance of analyzing the residuals is to get an idea of how reliable the accuracy of the model is. When evaluating the residuals we are check that they meet certain criteria. 1. The residuals are uncorrelated 2. The residuals have a mean of 0 Other properties include… 3. The residuals are normally distributed 4. The residuals demonstrate constant variability - (homoscedasticity) If the residuals do not represent white noise, than we can also assume that the models not capturing all of the data. When visualizing the residuals from forecasting the beer production in Australia we can see that there is constant variance around zero, the distribution appear to be normally distributed, and there is low autocorrelation. In the autocorrelation plot we can see the highest correlation at lag 4 passing the threshold of significance, indicating the seasonality in the data.
Australian Exports series from global_economy
# Extract data of interest
aus_exports <- global_economy %>%
filter(Country == "Australia")
I decided to use the Naive model as there is only yearly data, making the seasonal naive model not appropriate.
# Visualize time series
aus_exports %>% autoplot(Exports) + labs(title = "Australian Exports", y = "% GDP")
# Define and estimate a model
aus_exports_fit <- aus_exports |> model(NAIVE(Exports))
# Look at the residuals
aus_exports_fit |> gg_tsresiduals()
# Look a some forecasts
aus_exports_fit |> forecast(h = 3) |> autoplot(aus_exports) + labs(title = "Australian Export Forecasts: Naive Mdoel", y = "% GDP")
Aside from the first lag, the residuals are not autocorrelated. Although the residual correlation of the first demonstrates significance, the correlation coefficient is still pretty low so potentially can be excused. The innovation residual show homoscedasticity as there is constant variance around 0 ranging from approximately - 2.5 to 2.5. The distribution of the residuals also demonstrate they are normally distributed.
# Visualize time series
bricks %>% autoplot(Bricks) + labs(title = "Australian Clay Brick Production", y = "Millions")
# Define and estimate a model
bricks_fit <- bricks %>% model(NAIVE(Bricks))
# Look at the residuals
bricks_fit %>% gg_tsresiduals()
# Look a some forecasts
bricks_fit %>% forecast(h = 12) |> autoplot(bricks) + labs(title = "Australian Clay Brick Production: Seasonal Naive Mdoel", y = "Millions")
Using a seasonal naive model to forecast the number of clay bricks that Australia produces, the variance around 0 in the residual plot shows more fluctuations from 1975 - 1990, demonstrating irregularity and violating the assumption that there is homoscedasticity. There are several highly correlated residual lags. The high negative correlations every 4th lag starting 2, are the correlations of between quarters - 2 quarters apart (Q2/Q4 and Q1/Q3). The high positive correlation of residuals every 4th lag starting at the lag 4 show that a quarters residual is correlated with the value of the same quarter from the previous year. The distribution of the residuals also seem to be slightly right skewed.
Since the residual plot shows a consistent pattern and the residuals are autocorrelated, both showing strong seasonal patterns, the residuals do not appear to be white noise and an alternative model is suggested when forecasting the Australian clay brick production.
set.seed(123)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>% autoplot(Turnover) + labs(title = "Victoria Household and Goods Retail Turnover")
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Househo… SNAIV… Trai… 25.1 45.6 35.4 5.05 7.29 1 1 0.695
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Househo… Test 172. 213. 174. 15.1 15.2 4.90 4.68 0.947
Increasing the amount of the training data should generally improve the accuracy of the predictions by allowing a model to learn more complex patterns and seasonal variations within the data. This is especially true when dealing with longer horizon periods. In the code below I added 5 more years to the training data and re fit the model, forecasted and calculated the accuracy. Visually inspecting the differences in accuracy measures from the previous forcast, all metrics improved.
# Create new training set to add more years
myseries_train <- myseries |>
filter(year(Month) < 2016)
# Fit a new model
fit <- myseries_train |>
model(SNAIVE(Turnover))
# Forecast the remaining periods
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
# Generate plot
fc |> autoplot(myseries)
# Assess accuracy
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Househo… SNAIV… Trai… 28.3 49.8 38.4 4.94 6.99 1 1 0.718
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Househo… Test 79.4 92.2 79.4 6.63 6.63 2.07 1.85 0.688