Homework2

3.1

Consider the GDP information in global_economy. Plot the GDP per capita for each country over time. Which country has the highest GDP per capita? How has this changed over time?

library(ggplot2)
library(fpp3)

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.3     ✔ fable       0.4.1

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(dplyr)
library(tsibble)
library(feasts)
library(fable)
library(fabletools)
library(seasonal)

## 
## Attaching package: 'seasonal'

## The following object is masked from 'package:tibble':
## 
##     view

# GDP per capita 
data("global_economy", package = "fpp3")

## Warning in data("global_economy", package = "fpp3"): data set 'global_economy'
## not found

global_economy <- global_economy |> 
  mutate(GDP_per_capita = GDP / Population)

# GDP per capita over time for each country
ggplot(global_economy, aes(x = Year, y = GDP_per_capita, group = Country, color = Country)) +
  geom_line(alpha = 0.6) +  
  labs(title = "GDP Per Capita Over Time by Country", y = "GDP Per Capita (USD)") +
  theme_minimal() +
  theme(legend.position = "none")

## Warning: Removed 3242 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Find the country with the highest GDP per capita each year
highest_gdp_country <- global_economy |>
  index_by(Year) |>
  filter(GDP_per_capita == max(GDP_per_capita, na.rm = TRUE)) |>
  select(Year, Country, GDP_per_capita)

# Identify the country that appears most frequently
top_country <- highest_gdp_country |>
  count(Country, sort = TRUE) |>
  slice_max(n, n = 1) |>
  pull(Country)

# GDP per capita for the most frequent top country
global_economy |>
  filter(Country == top_country) |>
  autoplot(GDP_per_capita) +
  labs(title = paste("GDP per capita for", top_country),
       y = "$US") +
  theme_minimal()

## Warning: There were 2 warnings in `filter()`.
## The first warning was:
## ℹ In argument: `Country == top_country`.
## Caused by warning in `==.default`:
## ! longer object length is not a multiple of shorter object length
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).

3.2

For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect.

United States GDP from global_economy. Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock. Victorian Electricity Demand from vic_elec. Gas production from aus_production.

# US GDP
us_gdp <- global_economy |>
  filter(Country == "United States")

us_gdp |>
  autoplot(log(GDP)) +
  labs(title = "Log of US GDP Over Time", y = "Log($US)")

# Victorian Livestock Slaughter
vic_livestock <- aus_livestock |>
  filter(Animal == "Bulls, bullocks and steers", State == "Victoria")

vic_livestock |>
  mutate(Smoothed = slider::slide_mean(Count, before = 11, complete = TRUE)) |>
  autoplot(Smoothed) +
  labs(title = "12-Month Moving Average of Victorian Livestock Slaughter", y = "Slaughter Count")

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Victorian Electricity Demand
vic_elec |>
  mutate(Diff_Demand = difference(Demand)) |>
  autoplot(Diff_Demand) +
  labs(title = "First Difference of Victorian Electricity Demand", y = "Change in Demand (MWh)")

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Gas Production
aus_production |>
  autoplot(log(Gas)) +
  labs(title = "Log of Gas Production Over Time", y = "Log(Cubic Meters)")

3.3

Why is a Box-Cox transformation unhelpful for the canadian_gas data?

A Box-Cox transformation is unhelpful because the variance is already stable and it does not remove seasonality.

Box-Cox Transformation

lambda <- canadian_gas |> 
  features(Volume, features = guerrero) |> 
  pull(lambda_guerrero)

canadian_gas |> 
  mutate(Transformed = box_cox(Volume, lambda)) |> 
  autoplot(Transformed) +
  labs(title = "Box-Cox Transformed Canadian Gas Production", y = "Transformed Volume")

Seasonal Differencing

canadian_gas |> 
  mutate(Differenced = difference(Volume, 12)) |> 
  autoplot(Differenced) +
  labs(title = "Seasonally Differenced Canadian Gas Production", y = "Differenced Volume")

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

3.4

What Box-Cox transformation would you select for your retail data (from Exercise 7 in Section 2.10)? The lambda value foundwas approximately -0.057, which is close to 0. When λ ≈ 0, the Box-Cox transformation suggests a log transformation (log(x)) as the best choice. This transformation helps reduce heteroscedasticity and makes the data more normally distributed.

# Load necessary libraries
library(fpp3)
library(dplyr)
library(ggplot2)
library(tidyr)

aus_retail <- aus_retail %>%
  filter(Turnover > 0)

# Compute optimal lambda 
lambda_values <- aus_retail %>%
  group_by(State, Industry) %>%
  summarise(lambda_guerrero = guerrero(Turnover), .groups = "drop") %>%
  distinct(State, Industry, lambda_guerrero)  # Ensure uniqueness

aus_retail <- aus_retail %>%
  as_tibble() %>%
  left_join(lambda_values, by = c("State", "Industry"))

# Apply Box-Cox transformation based on lambda
aus_retail <- aus_retail %>%
  mutate(Transformed_Turnover = case_when(
    lambda_guerrero > 0.5 ~ Turnover,  # No transformation
    lambda_guerrero > 0 & lambda_guerrero <= 0.5 ~ Turnover^lambda_guerrero,  # Power transformation
    abs(lambda_guerrero) < 0.1 ~ log(Turnover),  # Log transformation
    lambda_guerrero < -0.1 ~ 1 / (Turnover^abs(lambda_guerrero))  # Inverse transformation
  ))

# Convert back to tsibble
aus_retail <- aus_retail %>%
  as_tsibble(index = Month, key = c(State, Industry))

# Plot: Original vs Transformed Turnover
ggplot(aus_retail, aes(x = Month)) +
  geom_line(aes(y = Turnover, color = "Original"), alpha = 0.6) +
  geom_line(aes(y = Transformed_Turnover, color = "Box-Cox Transformed"), alpha = 0.8) +
  facet_wrap(~Industry, scales = "free_y") +  # Multiple panels for each industry
  labs(title = "Comparison of Turnover Before and After Box-Cox Transformation",
       y = "Turnover ($AUD)",
       x = "Month") +
  theme_minimal() +
  scale_color_manual(values = c("Original" = "steelblue", "Box-Cox Transformed" = "red"))

3.5

For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance. Tobacco from aus_production, Economy class passengers between Melbourne and Sydney from ansett, and Pedestrian counts at Southern Cross Station from pedestrian.

aus_production <- aus_production %>% filter(Tobacco > 0)
ansett <- ansett %>% filter(Class == "Economy", Passengers > 0)
pedestrian <- pedestrian %>% filter(Count > 0)

# Compute the optimal lambda values
lambda_tobacco <- aus_production %>% features(Tobacco, guerrero) %>% pull(lambda_guerrero) %>% first()
lambda_passengers <- ansett %>% features(Passengers, guerrero) %>% pull(lambda_guerrero) %>% first()
lambda_pedestrian <- pedestrian %>% features(Count, guerrero) %>% pull(lambda_guerrero) %>% first()

# Apply Box-Cox transformation using the extracted lambda values
aus_production <- aus_production %>%
  mutate(Transformed_Tobacco = box_cox(Tobacco, lambda_tobacco))

ansett <- ansett %>%
  mutate(Transformed_Passengers = box_cox(Passengers, lambda_passengers))

pedestrian <- pedestrian %>%
  mutate(Transformed_Count = box_cox(Count, lambda_pedestrian))

# Plot transformed vs. original series

# Tobacco Production
ggplot(aus_production, aes(x = Quarter)) +
  geom_line(aes(y = Tobacco, color = "Original"), alpha = 0.6) +
  geom_line(aes(y = Transformed_Tobacco, color = "Box-Cox Transformed"), alpha = 0.8) +
  labs(title = "Comparison of Tobacco Production Before and After Box-Cox Transformation",
       y = "Tobacco Production",
       x = "Quarter") +
  theme_minimal() +
  scale_color_manual(values = c("Original" = "steelblue", "Box-Cox Transformed" = "red"))

# Economy Class Passengers
colnames(ansett)

## [1] "Week"                   "Airports"               "Class"                 
## [4] "Passengers"             "Transformed_Passengers"

ggplot(ansett, aes(x = Week)) +  # Using correct time variable
  geom_line(aes(y = Passengers, color = "Original"), alpha = 0.6) +
  geom_line(aes(y = Transformed_Passengers, color = "Box-Cox Transformed"), alpha = 0.8) +
  labs(title = "Comparison of Economy Class Passengers Before and After Box-Cox Transformation",
       y = "Passengers",
       x = "Week") +
  theme_minimal() +
  scale_color_manual(values = c("Original" = "steelblue", "Box-Cox Transformed" = "red"))

# Pedestrian Counts
ggplot(pedestrian, aes(x = Date_Time)) +
  geom_line(aes(y = Count, color = "Original"), alpha = 0.6) +
  geom_line(aes(y = Transformed_Count, color = "Box-Cox Transformed"), alpha = 0.8) +
  labs(title = "Comparison of Pedestrian Counts Before and After Box-Cox Transformation",
       y = "Pedestrian Count",
       x = "Time") +
  theme_minimal() +
  scale_color_manual(values = c("Original" = "steelblue", "Box-Cox Transformed" = "red"))

3.7

Consider the last five years of the Gas data from aus_production.

gas <- tail(aus_production, 5*4) |> select(Gas) Plot the time series. Can you identify seasonal fluctuations and/or a trend-cycle? Use classical_decomposition with type=multiplicative to calculate the trend-cycle and seasonal indices. Do the results support the graphical interpretation from part a? Compute and plot the seasonally adjusted data. Change one observation to be an outlier (e.g., add 300 to one observation), and recompute the seasonally adjusted data. What is the effect of the outlier? Does it make any difference if the outlier is near the end rather than in the middle of the time series?

# Extract the last 5 years (5 * 4 quarters) of Gas data
gas <- tail(aus_production, 5 * 4) |> select(Gas)

# Plot original gas production
autoplot(gas) + 
  labs(title = "Gas Production (Last 5 Years)", y = "Cubic Meters") +
  theme_minimal()

## Plot variable not specified, automatically selected `.vars = Gas`

# Classical decomposition (Multiplicative)
gas_decomp <- gas %>%
  model(decomp = classical_decomposition(Gas, type = "multiplicative")) %>%
  components()

# Ensure valid decomposition output by filtering NA values
gas_decomp <- gas_decomp %>%
  filter(!is.na(trend) & !is.na(irregular))  # Fix: Use irregular instead of remainder

## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `!is.na(trend) & !is.na(irregular)`.
## Caused by warning in `is.na()`:
## ! is.na() applied to non-(list or vector) of type 'closure'

# Plot decomposition results
autoplot(gas_decomp) +
  labs(title = "Classical Decomposition of Gas Production") +
  theme_minimal()

# Compute seasonally adjusted gas production
gas_adj <- gas_decomp |> 
  mutate(Seasonally_Adjusted = Gas / seasonal)  # Multiplicative decomposition

# Plot seasonally adjusted gas production
autoplot(gas_adj, Seasonally_Adjusted) +
  labs(title = "Seasonally Adjusted Gas Production", y = "Cubic Meters") +
  theme_minimal()

### **Introduce an Outlier in the Middle**
gas_outlier <- gas
gas_outlier$Gas[10] <- gas_outlier$Gas[10] + 300  # Modify the 10th observation

gas_outlier_decomp <- gas_outlier |> 
  model(classical_decomposition(Gas, type = "multiplicative")) |> 
  components()

# Classical decomposition with outlier
gas_outlier_decomp <- gas_outlier_decomp |> 
  filter(!is.na(trend) & !is.na(irregular)) |>  
  mutate(Seasonally_Adjusted = Gas / seasonal)

## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `!is.na(trend) & !is.na(irregular)`.
## Caused by warning in `is.na()`:
## ! is.na() applied to non-(list or vector) of type 'closure'

# Plot seasonally adjusted gas with outlier in the middle
autoplot(gas_outlier_decomp, Seasonally_Adjusted) +
  labs(title = "Seasonally Adjusted Gas Production (With Outlier in Middle)",
       y = "Cubic Meters") +
  theme_minimal()

### **Introduce an Outlier at the End**
gas_outlier_end <- gas
gas_outlier_end$Gas[nrow(gas_outlier_end)] <- gas_outlier_end$Gas[nrow(gas_outlier_end)] + 300  # Last observation

# Classical decomposition with outlier at the end
gas_outlier_end_decomp <- gas_outlier_end |> 
  model(classical_decomposition(Gas, type = "multiplicative")) |> 
  components()

gas_outlier_end_decomp <- gas_outlier_end_decomp |> 
  filter(!is.na(trend) & !is.na(irregular)) |>  
  mutate(Seasonally_Adjusted = Gas / seasonal)

## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `!is.na(trend) & !is.na(irregular)`.
## Caused by warning in `is.na()`:
## ! is.na() applied to non-(list or vector) of type 'closure'

# Plot seasonally adjusted gas with outlier at the end
autoplot(gas_outlier_end_decomp, Seasonally_Adjusted) +
  labs(title = "Seasonally Adjusted Gas Production (With Outlier at End)",
       y = "Cubic Meters") +
  theme_minimal()

3.8

Recall your retail time series data (from Exercise 7 in Section 2.10). Decompose the series using X-11. Does it reveal any outliers, or unusual features that you had not noticed previously?

aus_retail <- as_tsibble(aus_retail, key = c(State, Industry), index = Month)

aus_retail <- aus_retail |> mutate(Turnover = as.numeric(Turnover))

aus_retail <- aus_retail |> filter(!is.na(Turnover))

retail_subset <- aus_retail |> 
  filter(Industry == "Cafes, restaurants and takeaway food services")

# X-11 decomposition 
x11_dcmp <- retail_subset |> 
  model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) |> 
  components()

# Plot decomposition results
autoplot(x11_dcmp) +
  labs(title = "X-11 Decomposition of Retail Turnover", 
       y = "Turnover ($AUD)") +
  theme_minimal()

x11_dcmp |> 
  ggplot(aes(x = Month)) +
  geom_line(aes(y = Turnover, colour = "Data")) +
  geom_line(aes(y = season_adjust, colour = "Seasonally Adjusted")) +
  geom_line(aes(y = trend, colour = "Trend")) +
  labs(y = "Turnover ($AUD)",
       title = "Seasonally Adjusted vs Trend in Retail Turnover") +
  scale_colour_manual(
    values = c("gray", "#0072B2", "#D55E00"),
    breaks = c("Data", "Seasonally Adjusted", "Trend")
  ) +
  theme_minimal()

3.9

Write about 3–5 sentences describing the results of the decomposition. Pay particular attention to the scales of the graphs in making your interpretation. Is the recession of 1991/1992 visible in the estimated components?

Trend: Steady increase in the labor force, with a noticeable dip around 1991-1992, reflecting the early 1990s recession.
Seasonality: Regular annual fluctuations, showing stable seasonal hiring trends over time.
Remainder: Significant deviations around 1991-1992, indicating unexpected employment shocks during the recession.

The recession is visible in the trend and irregular components but did not disrupt seasonal patterns.

Homework2

Amish Rasheed

2025-02-16

3.1

3.2

3.3

3.4

3.5

3.7

3.8

3.9