Homework3

library(tidyverse)
library(fpp3)
library(zoo)
library(imputeTS)

5.1

Australian Population (global_economy)

population <- global_economy %>%
  filter(Country == "Australia") %>%
  select(Year, Population)

autoplot(population, Population) + ggtitle("Australian Population (global_economy)")

The population exhibits a long-term increasing trend. Therefore, the best method for forecasting would be the drift method.

# Set training data from 1960 to 2005
train <- population |>
  filter_index("1960" ~ "2005")

# Fit the models
population_fit <- train |>
  model(
    Drift = NAIVE(Population ~ drift())
  )

# Generate forecasts for the next 12 years
population_fc <- population_fit |>
  forecast(h = 12)

# Extract actual  data from 2005 onward
real_data <- population |>
  select(Year, Population) |>
  na.omit() |>  # Ensure no missing values
  filter_index("2005" ~ .)

# Plot the forecast vs. actual data
population_fc |>
  autoplot(train, level = NULL, colour = "red") +  # Plot forecast with training data
  autolayer(real_data, Population, colour = "black") +  # Ensure real data is plotted
  labs(
    y = "Population",
    title = "Forecast vs. Actual Population"
  ) +
  guides(colour = guide_legend(title = "Forecast Method")) +
  theme_minimal()

Bricks (aus_production)

aus_production <- na.omit(aus_production)

autoplot(aus_production, Bricks) + ggtitle("Bricks (aus_production)")

The brick production exhibits a strong seasonal pattern due to construction cycles. Therefore, the best method for forecasting would be the Seasonal naïve method.

# Set training data from 1980 Q1 to 2000 Q4
train2 <- aus_production |>
  select(Quarter, Bricks) |>
  na.omit() |>  # Remove missing values
  filter_index("1980 Q1" ~ "2000 Q4")

# Fit the models
Bricks_fit <- train2 |>
  model(
    `Seasonal naïve` = SNAIVE(Bricks)
  )

# Generate forecasts for the next 14 quarters (2001 Q1 onward)
Bricks_fc <- Bricks_fit |>
  forecast(h = 14)

# Extract actual  data from 2001 Q1 onward
real_data2 <- aus_production |>
  select(Quarter, Bricks) |>
  na.omit() |>  # Ensure no missing values
  filter_index("2000 Q4" ~ .)

# Plot the forecast vs. actual data
Bricks_fc |>
  autoplot(train2, level = NULL, colour = "red") +  # Plot forecast with training data
  autolayer(real_data2, colour = "black") +  # Ensure real data is plotted
  labs(
    y = "Millions of Bricks",
    title = "Forecast vs. Actual Bricks Production"
  ) +
  guides(colour = guide_legend(title = "Forecast Method")) +
  theme_minimal()

NSW Lambs (aus_livestock)

livestock <- aus_livestock %>%
  filter(State == "New South Wales", Animal == "Lambs") %>%
  select(Month, Count)

autoplot(livestock, Count) + ggtitle("NSW Lambs (aus_livestock)")

The lamb production exhibits a strong seasonal pattern due to breeding cycles. Therefore, the best method for forecasting would be the Seasonal naïve method.

# Set training data from January 2000 to December 2015 
train3 <- livestock |>
  select(Month, Count) |>
  na.omit() |>  # Remove missing values
  filter_index("2000/01" ~ "2015/12")

# Fit the models
lambs_fit <- train3 |>
  model(
    `Seasonal naïve` = SNAIVE(Count)
  )

# Generate forecasts for the next 36 months
lambs_fc <- lambs_fit |>
  forecast(h = 36)

# Extract actual data from January 2016 onward
real_data3 <- livestock |>
  select(Month, Count) |>
  na.omit() |>  # Ensure no missing values
  filter_index("2016/01" ~ .)

# Plot the forecast vs. actual data
lambs_fc |>
  autoplot(train3, level = NULL, colour = "red") +  # Plot forecast with training data
  autolayer(real_data3, colour = "black") +  # Ensure real data is plotted
  labs(
    y = "Count",
    title = "Forecast vs. Actual Lambs Production"
  ) +
  theme_minimal()

Household wealth (hh_budget).

autoplot(hh_budget, Wealth) + ggtitle("Household wealth (hh_budget)")

The Household Wealth exhibits a long-term increasing trend. Therefore, the best method for forecasting would be the drift method.

# Set training data from 2005 to 2013
train4 <- hh_budget |>
  select(Country, Year, Wealth) |>
  filter_index("2005" ~ "2013")

# Fit the models
Wealth_fit <- train4 |>
  model(
    Drift = NAIVE(Wealth ~ drift())
  )

# Generate forecasts for the next 3 years
Wealth_fc <- Wealth_fit |>
  forecast(h = 3)

# Extract actual data from 2010 onward
real_data4 <- hh_budget |>
  select(Country, Year, Wealth) |>
  na.omit() |>  # Ensure no missing values
  filter_index("2010" ~ .)

# Plot the forecast vs. actual data
Wealth_fc |>
  autoplot(train4, level = NULL, colour = "red") +  # Plot forecast with training data
  autolayer(real_data4, Wealth, colour = "black") +  # Ensure real data is plotted
  labs(
    y = "Wealth",
    title = "Forecast vs. Actual Household wealth"
  ) +
  guides(colour = guide_legend(title = "Forecast Method")) +
  theme_minimal()

Australian takeaway food turnover (aus_retail).

retail <- aus_retail %>%
  filter(Industry == "Cafes, restaurants and takeaway food services", State == "Australian Capital Territory") %>%
  select(Month, Turnover)

autoplot(retail, Turnover) + ggtitle("Australian Capital Territory takeaway food turnover (aus_retail)") + theme(legend.position = "bottom")

The amount of turnover in millions exhibits a strong seasonal pattern. Therefore, the best method for forecasting would be the Seasonal naïve method.

# Set training data from January 2000 to December 2015 
train5 <- retail |>
  select(Month, Turnover) |>
  na.omit() |>  # Remove missing values
  filter_index("2006/01" ~ "2015/12")

# Fit the models
food_fit <- train5 |>
  model(
    `Seasonal naïve` = SNAIVE(Turnover)
  )

# Generate forecasts for the next 36 months
food_fc <- food_fit |>
  forecast(h = 36)

# Extract actual data from January 2016 onward
real_data5 <- retail |>
  select(Month, Turnover) |>
  na.omit() |>  # Ensure no missing values
  filter_index("2016/01" ~ .)

# Plot the forecast vs. actual data
food_fc |>
  autoplot(train5, level = NULL, colour = "red") +  # Plot forecast with training data
  autolayer(real_data5, colour = "black") +  # Ensure real data is plotted
  labs(
    y = "Millions",
    title = "Forecast vs. Actual Turnover in Million AUD"
  ) +
  theme_minimal()

5.2

# Filter for Facebook stock prices
facebook_data <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  select(Date, Close)

# Step 1: Create a full sequence of daily dates (regular time series)
full_dates <- tibble(Date = seq(min(facebook_data$Date), max(facebook_data$Date), by = "1 day"))

# Step 2: Merge Facebook data with full sequence of dates
facebook_data <- full_dates %>%
  left_join(facebook_data, by = "Date") %>%
  as_tsibble(index = Date)  # Convert to tsibble

# Step 3: Fill missing stock prices (e.g., weekends, holidays)
facebook_data <- facebook_data %>%
  mutate(Close = na_interpolation(Close))  # Use interpolation to fill missing values

# Convert to a regular time series
facebook_data <- facebook_data %>%
  tsibble::update_tsibble(regular = TRUE)

# Plot the cleaned time series
autoplot(facebook_data, Close) +
  ggtitle("Facebook Stock Price Over Time (Regular Time Series)") +
  xlab("Year") + ylab("Stock Price (USD)")

# Apply the Drift method for forecasting
fb_drift <- facebook_data %>%
  model(RW(Close ~ drift())) %>%
  forecast(h = 100)

# Plot forecasts
autoplot(facebook_data, Close) +
  autolayer(fb_drift, level = NULL, series = "Drift Forecast") +
  ggtitle("Facebook Stock Price Forecast using Drift Method") +
  xlab("Year") + ylab("Stock Price (USD)")

# Compute the first and last values of Close
first_value <- first(facebook_data$Close)
last_value <- last(facebook_data$Close)
total_rows <- nrow(facebook_data)  # Use nrow() instead of n()

# Ensure there are at least two rows to avoid division by zero
slope <- (last_value - first_value) / (total_rows - 1)

# Add DriftLine column
facebook_data <- facebook_data %>%
  mutate(DriftLine = first_value + (row_number() - 1) * slope)

# Plot the data with the drift line
autoplot(facebook_data, Close) + 
  autolayer(fb_drift, level = NULL, series = "Drift Forecast") +
  geom_line(aes(x = Date, y = DriftLine), color = "red", linetype = "dashed") +
  ggtitle("Facebook Stock Price with Drift Line") +
  xlab("Year") + ylab("Stock Price (USD)")

# Compare with other benchmark methods
fb_forecasts <- facebook_data %>%
  model(
    Naive = NAIVE(Close),
    Mean = MEAN(Close),
    Drift = RW(Close ~ drift())
  ) %>%
  forecast(h = 100)

# Plot all forecasts
autoplot(facebook_data, Close) +
  autolayer(fb_forecasts, level = NULL) +
  ggtitle("Comparison of Benchmark Forecasting Methods") +
  xlab("Year") + ylab("Stock Price (USD)")

The most effective approach is to employ the drift method, as it is highly improbable that the stock price will remain constant in the future or that it will simply represent the average of historical values.

5.3

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)

# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))

# Look at the residuals
fit |> gg_tsresiduals()

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

The plot of residuals over time indicates that they do not appear random, suggesting a possible seasonality. Additionally, the autocorrelation function plot shows that some values fall outside the blue dashed lines, indicating significant correlations at certain lags, which points to seasonal dependencies. The histogram of the residuals is skewed, and there is a large number of residuals. Therefore, the residuals do not resemble white noise.

5.4

Australian Exports series from global_economy

# Extract Australian Exports series
australia_exports <- global_economy |> 
  filter(Country == "Australia") |> 
  select(Year, Exports)

glimpse(australia_exports)

## Rows: 58
## Columns: 2
## $ Year    <dbl> 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 19…
## $ Exports <dbl> 12.99445, 12.40310, 13.94301, 13.00589, 14.93825, 13.22018, 12…

Since the data is annual (Year as the time unit), there is no seasonality. Therefore it is best to use NAIVE() method.

# Fit Naïve model for Exports
fit_exports <- australia_exports |> model(NAIVE(Exports))

# Residual analysis for Exports
fit_exports |> gg_tsresiduals()

# Forecast Exports for the next 10 years
fc_exports <- fit_exports |> forecast(h = 10)

# Plot Exports forecast
fc_exports |> autoplot(australia_exports) +
  ggtitle("Naïve Forecast for Australian Exports") +
  xlab("Year") + ylab("Exports (Billion USD)")

In the residual plots, the top residual plot appears random. The lags in the ACF plot mostly fall within the blue dashed lines. Additionally, the histogram of the residuals shows a normal distribution. These show that the residuals look like white noise.

Bricks series from aus_production

# Extract Bricks production series
bricks_data <- aus_production |> 
  select(Quarter, Bricks)

Since the data is quarterly, it is likely to have a seasonal pattern. Use SNAIVE() (to account for seasonal fluctuations).

# Fit Seasonal Naïve model for Bricks
fit_bricks <- bricks_data |> model(SNAIVE(Bricks))

# Residual analysis for Bricks
fit_bricks |> gg_tsresiduals()

# Forecast Bricks for the next 8 quarters (2 years)
fc_bricks <- fit_bricks |> forecast(h = "2 years")

# Plot Bricks forecast
fc_bricks |> autoplot(bricks_data) +
  ggtitle("Seasonal Naïve Forecast for Brick Production") +
  xlab("Year") + ylab("Brick Production (Millions)")

The top residual plot exhibits a noticeable pattern, suggesting the presence of seasonality. Additionally, the ACF plot reveals multiple lags that extend beyond the blue dashed lines, displaying a distinct pattern as well. Furthermore, the histogram of the residuals shows a skewed distribution. All of these factors indicate that the residuals do not resemble white noise.

5.7

set.seed(1225)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

fit <- myseries_train |>
  model(`Seasonal naïve` = SNAIVE(Turnover))

fit |> gg_tsresiduals()

The residuals do not appear to be uncorrelated or normally distributed. The ACF plot shows multiple lags that fall outside the blue dashed lines, suggesting a pattern in the residuals. Additionally, many of the residuals are skewed.

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

fit |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 New Sou… Takeawa… Seaso… Trai…  11.5  26.1  19.2  4.81  9.59     1     1 0.890

fc |> accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Seasonal… New … Takeawa… Test   48.6  96.8  79.5  7.67  16.3  4.14  3.71 0.964

Accuracy measures improve with more data, but they require high-quality, well-prepared data that does not contain new patterns.