library(tsibbledata)
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ ggplot2 3.5.1
## ✔ dplyr 1.1.4 ✔ tsibble 1.1.6
## ✔ tidyr 1.3.1 ✔ feasts 0.4.2
## ✔ lubridate 1.9.4 ✔ fable 0.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
global_economy
gdp_pc <- global_economy %>%
mutate(GDP_per_capita = GDP / Population)
gdp_pc
gdp_pc %>%
ggplot(aes(x = Year, y = GDP_per_capita, group = Country, colour = Country)) +
geom_line(show.legend = FALSE) +
labs(title = "GDP per capita for each country over time",
y = "GDP per capita (USD)",
x = "Year")
## Warning: Removed 3242 rows containing missing values or values outside the scale range
## (`geom_line()`).
gdp_pc %>%
filter(Year == max(Year)) %>%
arrange(desc(GDP_per_capita)) %>%
slice(1)
us_gdp <- global_economy |>
filter(Country == "United States") |>
mutate(GDP_per_capita = GDP / Population)
# Original GDP
us_gdp |>
autoplot(GDP) +
labs(title = "United States GDP (Original)", y = "US$ (Billions)")
# GDP per capita (population adjustment)
us_gdp |>
autoplot(GDP_per_capita) +
labs(title = "United States GDP per Capita", y = "US$ per person")
# GDP inflation adjustment
us_gdp <- global_economy |>
filter(Country == "United States") |>
mutate(GDP_real = GDP / (CPI / 100)) # CPI base = 100
autoplot(us_gdp, GDP_real) +
labs(title = "US Real GDP (Inflation adjusted)", y = "US$ (Billions, constant prices)")
us_gdp <- us_gdp |>
mutate(GDP_per_capita_real = GDP_real / Population)
autoplot(us_gdp, GDP_per_capita_real) +
labs(title = "US Real GDP per Capita", y = "US$ (constant prices)")
bulls <- aus_livestock |>
filter(Animal == "Bulls, bullocks and steers", State == "Victoria")
# Original
bulls |>
autoplot(Count) +
labs(title = "Victorian Slaughter of Bulls, Bullocks & Steers (Original)", y = "Count")
# Log transform
bulls |>
autoplot(log(Count)) +
labs(title = "Victorian Slaughter of Bulls, Bullocks & Steers (Log scale)", y = "log(Count)")
# Original electricity demand
vic_elec |>
autoplot(Demand) +
labs(title = "Victorian Electricity Demand (Original)", y = "MW")
# Calendar adjustment: average demand by day of week
vic_elec |>
index_by(Day = date(Time)) |>
summarise(Daily_Demand = sum(Demand)) |>
mutate(Weekday = wday(Day, label = TRUE)) |>
ggplot(aes(x = Day, y = Daily_Demand, colour = Weekday)) +
geom_line() +
labs(title = "Victorian Electricity Demand (Calendar Adjustment)",
y = "MW", x = "Day")
gas <- aus_production |>
select(Gas)
# Original
gas |>
autoplot(Gas) +
labs(title = "Australian Gas Production (Original)", y = "Petajoules")
# Log transform
gas |>
autoplot(log(Gas)) +
labs(title = "Australian Gas Production (Log scale)", y = "log(Petajoules)")
library(fpp3)
canadian_gas
canadian_gas %>%
autoplot(Volume) +
labs(title = "Canadian Gas Production", y = "Monthly Volume")
canadian_gas %>%
features(Volume, features = guerrero)
lambda <- canadian_gas |>
features(Volume, features = guerrero) |>
pull(lambda_guerrero)
canadian_gas |>
autoplot(box_cox(Volume, lambda)) +
labs(title = paste("Canadian Gas Production (Box-Cox, lambda =", round(lambda,3), ")"))
A Box–Cox transformation is unhelpful for the Canadian gas production
series because the variance is already stable with seasonal fluctuations
remaining about the same size across time. lambda=0.58 suggests a mild
power transform, but after I applied it to the data it produced an
almost identical plot - showing that there is no additional benefits of
using box-cox here.
set.seed(2705)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
# Estimate the Box–Cox lambda using Guerrero
lam <- myseries |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
lam
## [1] 0.4807535
Since lambda is between 0 and 1, a mild Box–Cox power transform with lambda = 0.4807535 will stabilize the variance - if even only slightly. Since it’s not exactly 0, log transform may not be the best.
myseries_boxcox <- myseries |>
mutate(Turnover_boxcox = box_cox(Turnover, lambda = 0.4807535))
myseries_boxcox |>
pivot_longer(cols = c(Turnover, Turnover_boxcox),
names_to = "Series", values_to = "Value") |>
ggplot(aes(x = Month, y = Value)) +
geom_line() +
facet_wrap(~ Series, scales = "free_y", ncol = 1) +
labs(title = "Retail Turnover: Original vs Box-Cox Transformed (lambda = 0.481)",
y = "Turnover", x = "Month")
With this transformation, the overall trend remains - but it is a bit smoother (less changing variance).
# Tobacco production
tobacco <- aus_production %>% select(Tobacco)
# lambda
tobacco %>%
features(Tobacco, features = guerrero)
# transformation
tobacco_boxcox <- tobacco |>
mutate(Tobacco_boxcox = box_cox(Tobacco, lambda = 0.926))
# Original vs Box-Cox
tobacco_boxcox |>
pivot_longer(cols = c(Tobacco, Tobacco_boxcox),
names_to = "Series", values_to = "Value") |>
ggplot(aes(x = Quarter, y = Value)) +
geom_line() +
facet_wrap(~ Series, scales = "free_y", ncol = 1) +
labs(title = "Tobacco Production: Original vs Box-Cox Transformed (lambda = 0.926)",
y = "Production")
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).
For Tobacco from aus_production, we got lambda=0.93, which is very close to 1. This shows that the variance is already stable. The Box–Cox transformation only changed the scale slightly, so the original plot was already appropriate without major adjustment.
ansett %>%
filter(Airports == "MEL-SYD", Class == "Economy") %>%
features(Passengers, features = guerrero)
# transformation
ansett_boxcox <- ansett |>
mutate(Passengers_boxcox = box_cox(Passengers, lambda = 2))
# Original vs Box-Cox
ansett_boxcox |>
pivot_longer(cols = c(Passengers, Passengers_boxcox),
names_to = "Series", values_to = "Value") |>
ggplot(aes(x = Week, y = Value)) +
geom_line() +
facet_wrap(~ Series, scales = "free_y", ncol = 1) +
labs(title = "Economy Class MEL-SYD: Original vs Box-Cox Transformed (lambda = 2)",
y = "Passengers")
For the Melbourne–Sydney economy passengers, we have lambda = 2. This
suggests that transforming this data will only exaggerate the
differences (by squaring the data) rather than stabilizing them. We
should just use the original plot without transformation in this
case.
pedestrian %>%
filter(Sensor == "Southern Cross Station") %>%
features(Count, features = guerrero)
# lambda = -0.25 -> lambda = 0
ped_sc_boxcox <- pedestrian |>
mutate(Count_log = log(Count))
# Original vs Box-Cox
ped_sc_boxcox |>
pivot_longer(cols = c(Count, Count_log),
names_to = "Series", values_to = "Value") |>
ggplot(aes(x = Date_Time, y = Value)) +
geom_line() +
facet_wrap(~ Series, scales = "free_y", ncol = 1) +
labs(title = "Southern Cross Pedestrians: Original vs Box-Cox Transformed (lambda = 0)",
y = "Pedestrian Counts")
For the Southern Cross pedestrians, lambda = -0.25. There is unstable
variance with extreme peaks, and a strong transformation is needed to
stabilize it. Since lambda is close to 0, a power transformation less
than 0 helps stabilize variance - so a log-like transformation is
appropriate here.
gas <- tail(aus_production, 5*4) |> select(Gas)
gas
gas |> autoplot(Gas) +
labs(title = "Australian Gas Production (last 5 years)",
y = "Petajoules")
In this plot, I can see very clear seasonality (quarterly cycles) and a
very slight upward trend.
gas_dc <- gas |>
model(classical_decomposition(Gas, type = "multiplicative")) |>
components()
gas_dc
autoplot(gas_dc)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
In 3.7a, I mentioned clear seasonality (quarterly cycles) and a very slight upward trend. So yes, the trend component from part b’s decomposition confirms this slight gradual increase. The seasonal component also confirms the clear quarterly seasonality, with higher values around Q2–Q3 and lower values in Q1–Q4, matching the peaks and dips in part a’s plot.
Since the remainder is small, we can conclude that most of the variation was cause by trend and seasonality.
# Multiplicative classical decomposition
gas_dc <- gas |>
model(classical_decomposition(Gas, type = "multiplicative")) |>
components()
# original vs seasonally adjusted
gas_dc |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas), colour = "grey") +
geom_line(aes(y = season_adjust), colour = "blue") +
labs(title = "Australian Gas Production (Seasonally Adjusted)",
y = "Petajoules", x = "Quarter")
gas_outlier <- gas
gas_outlier$Gas[10] <- gas_outlier$Gas[10] + 300 # adding 300 to one observation
gas_dc_outlier <- gas_outlier |>
model(classical_decomposition(Gas, type = "multiplicative")) |>
components()
gas_dc_outlier |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas), colour = "grey") +
geom_line(aes(y = season_adjust), colour = "red") +
labs(title = "Australian Gas Production (With Outlier, Seasonally Adjusted)",
y = "Petajoules", x = "Quarter")
gas_mid <- gas
gas_end <- gas
# middle outlier
gas_mid$Gas[10] <- gas_mid$Gas[10] + 300
# end outlier
gas_end$Gas[nrow(gas_end)] <- gas_end$Gas[nrow(gas_end)] + 300
gas_dc_mid <- gas_mid |>
model(classical_decomposition(Gas, type = "multiplicative")) |>
components()
gas_dc_end <- gas_end |>
model(classical_decomposition(Gas, type = "multiplicative")) |>
components()
# outlier in the middle
p_mid <- gas_dc_mid |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas), colour = "grey") +
geom_line(aes(y = season_adjust), colour = "red") +
labs(title = "Australian Gas Production (Outlier in Middle, Seasonally Adjusted)",
y = "Petajoules", x = "Quarter")
# outlier at the end
p_end <- gas_dc_end |>
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas), colour = "grey") +
geom_line(aes(y = season_adjust), colour = "blue") +
labs(title = "Australian Gas Production (Outlier at End, Seasonally Adjusted)",
y = "Petajoules", x = "Quarter")
p_mid
p_end
set.seed(2705)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries %>%
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
components() %>%
autoplot()
There was a steady upward trend in retail turnover over time showing significant growth around the early 2000s.
With regards to the seasonality, the repeating monthly patterns are very consistent across the entire period but the magnitude/impact does seem to lessen overtime.
The irregular plot does highlight some very sharp spikes (especially around the mid to late 1980s), which may be due to outliers or unusual events that occurred.