This R Markdown answers decomposition exercises from: https://otexts.com/fpp3/decomposition-exercises.html
Exercises covered: 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9.
Note on tsibble vs tibble usage:
I keep data as tsibble for time-series tasks (plotting,
lagging, modeling, decomposition).
I convert briefly to tibble only for cross-sectional
ranking in Exercise 3.1 (group_by(Year),
slice_max() across countries), because Year is
the tsibble index and tsibble does not allow grouping directly on index
variables.
Consider GDP in global_economy. Plot GDP per capita over
time. Which country has highest GDP per capita? How has this changed
over time?
# Remove World Bank aggregate groups so only sovereign-country series remain.
wb_aggregate_codes <- c(
"ARB","CEB","CSS","EAP","EAS","ECA","ECS","EMU","EUU","FCS","HIC","HPC",
"IBD","IBT","IDA","IDB","IDX","LAC","LCN","LDC","LIC","LMC","LMY","LTE",
"MEA","MIC","MNA","NAC","OED","OSS","PRE","PST","SAS","SSA","SSF","SST",
"TEA","TEC","TLA","TMN","TSA","TSS","UMC","WLD"
)
gdp_pc <- global_economy |>
filter(!is.na(Code), !Code %in% wb_aggregate_codes) |>
mutate(GDP_per_capita = GDP / Population) |>
filter(!is.na(GDP_per_capita))
year_counts <- gdp_pc |>
as_tibble() |>
group_by(Year) |>
summarise(n_countries = n_distinct(Country), .groups = "drop")
latest_year <- year_counts |>
filter(n_countries >= 10) |>
summarise(latest = max(Year, na.rm = TRUE)) |>
pull(latest)
top10_countries <- gdp_pc |>
as_tibble() |>
filter(Year == latest_year) |>
slice_max(order_by = GDP_per_capita, n = 10, with_ties = FALSE) |>
pull(Country)
best_overall <- gdp_pc |>
slice_max(order_by = GDP_per_capita, n = 1, with_ties = FALSE) |>
select(Country, Year, GDP_per_capita)
plot_countries <- union(top10_countries, best_overall$Country)
gdp_pc_top <- gdp_pc |>
filter(Country %in% plot_countries)
country_pop_ref <- gdp_pc |>
as_tibble() |>
filter(Country %in% plot_countries) |>
group_by(Country) |>
filter(Year == max(Year, na.rm = TRUE)) |>
slice_tail(n = 1) |>
ungroup() |>
select(Country, Population)
pop_cut <- median(country_pop_ref$Population, na.rm = TRUE)
country_style <- country_pop_ref |>
arrange(desc(Population), Country) |>
mutate(
pop_group = if_else(Population >= pop_cut, "Large population", "Lower population"),
small_rank = cumsum(pop_group == "Lower population"),
line_style = case_when(
pop_group == "Large population" ~ "solid",
small_rank %% 2 == 1 ~ "dashed",
TRUE ~ "dotted"
)
) |>
select(Country, pop_group, line_style)
gdp_pc_top <- gdp_pc_top |>
left_join(country_style, by = "Country")
# Second chart subset: industrialized nations (OECD members) with
# latest observed population >= 10 million.
oecd_codes <- c(
"AUS","AUT","BEL","CAN","CHL","COL","CRI","CZE","DNK","EST","FIN","FRA",
"DEU","GRC","HUN","ISL","IRL","ISR","ITA","JPN","KOR","LVA","LTU","LUX",
"MEX","NLD","NZL","NOR","POL","PRT","SVK","SVN","ESP","SWE","CHE","TUR",
"GBR","USA"
)
country_latest_pop <- gdp_pc |>
as_tibble() |>
group_by(Country, Code) |>
filter(Year == max(Year, na.rm = TRUE)) |>
slice_tail(n = 1) |>
ungroup() |>
select(Country, Code, Population)
industrialized_10m_countries <- country_latest_pop |>
filter(Code %in% oecd_codes, Population >= 1e7) |>
pull(Country)
gdp_pc_industrialized_10m <- gdp_pc |>
filter(Country %in% industrialized_10m_countries)
ind_top_pop_ref <- gdp_pc_industrialized_10m |>
as_tibble() |>
group_by(Country) |>
filter(Year == max(Year, na.rm = TRUE)) |>
slice_tail(n = 1) |>
ungroup() |>
select(Country, Population)
ind_pop_cut <- median(ind_top_pop_ref$Population, na.rm = TRUE)
ind_style <- ind_top_pop_ref |>
arrange(desc(Population), Country) |>
mutate(
pop_group = if_else(Population >= ind_pop_cut, "Large population", "Lower population"),
small_rank = cumsum(pop_group == "Lower population"),
line_style = case_when(
pop_group == "Large population" ~ "solid",
small_rank %% 2 == 1 ~ "dashed",
TRUE ~ "dotted"
)
) |>
select(Country, line_style)
gdp_pc_industrialized_10m_all <- gdp_pc_industrialized_10m |>
left_join(ind_style, by = "Country")
best_overall
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country Year GDP_per_capita
## <fct> <dbl> <dbl>
## 1 Monaco 2014 185153.
gdp_pc_top |>
distinct(Country, pop_group, line_style) |>
arrange(desc(pop_group), Country)
## # A tibble: 11 × 3
## Country pop_group line_style
## <fct> <chr> <chr>
## 1 Iceland Lower population dotted
## 2 Luxembourg Lower population dashed
## 3 Macao SAR, China Lower population dotted
## 4 Monaco Lower population dashed
## 5 Qatar Lower population dashed
## 6 Denmark Large population solid
## 7 Ireland Large population solid
## 8 Norway Large population solid
## 9 Singapore Large population solid
## 10 Switzerland Large population solid
## 11 United States Large population solid
gdp_pc_top |>
ggplot(aes(x = Year, y = GDP_per_capita, colour = Country, linetype = line_style)) +
geom_line(linewidth = 0.8, alpha = 0.95) +
scale_linetype_manual(
values = c("solid" = "solid", "dashed" = "dashed", "dotted" = "dotted"),
breaks = c("solid", "dashed", "dotted"),
labels = c("Larger population", "Lower population (dashed)", "Lower population (dotted)")
) +
labs(
title = "GDP per capita over time (top countries)",
subtitle = "Solid lines are larger-population countries; lower-population countries use dashed/dotted lines",
y = "GDP per capita",
x = "Year"
) +
theme(
panel.background = element_rect(fill = "white", colour = NA),
plot.background = element_rect(fill = "white", colour = NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.spacing.x = unit(0.08, "cm"),
legend.spacing.y = unit(0.02, "cm"),
legend.key.height = unit(0.28, "cm"),
legend.key.width = unit(0.7, "cm")
)
gdp_pc_industrialized_10m_all |>
ggplot(aes(x = Year, y = GDP_per_capita, colour = Country, linetype = line_style)) +
geom_line(linewidth = 0.75, alpha = 0.95) +
scale_linetype_manual(
values = c("solid" = "solid", "dashed" = "dashed", "dotted" = "dotted"),
breaks = c("solid", "dashed", "dotted"),
labels = c("Larger population", "Lower population (dashed)", "Lower population (dotted)")
) +
labs(
title = "GDP per capita: industrialized countries (population >= 10M)",
subtitle = "Solid lines are larger-population countries; OECD member proxy; includes all qualifying countries (including US)",
y = "GDP per capita",
x = "Year"
) +
theme(
panel.background = element_rect(fill = "white", colour = NA),
plot.background = element_rect(fill = "white", colour = NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.spacing.x = unit(0.08, "cm"),
legend.spacing.y = unit(0.02, "cm"),
legend.key.height = unit(0.28, "cm"),
legend.key.width = unit(0.7, "cm")
)
Answer: The single highest country-year value is Monaco in 2014. This exercise now excludes regional/aggregate entries (World Bank aggregate codes), so the chart contains only sovereign-country series. To keep the chart readable, I use the top countries in the latest year with enough observations and include the overall maximum country if needed. I also encode population size in line type: larger-population countries are solid, lower-population countries are dashed/dotted.
For each series, graph the data. If transforming is appropriate, transform and describe the effect.
us_gdp <- global_economy |> filter(Country == "United States") |> select(Year, GDP)
bulls_vic <- aus_livestock |> filter(Animal == "Bulls, bullocks and steers", State == "Victoria") |> select(Month, Count)
vic_demand <- vic_elec |> select(Time, Demand)
gas_prod <- aus_production |> select(Quarter, Gas)
lambda_us_gdp <- us_gdp |> features(GDP, guerrero) |> pull(lambda_guerrero)
lambda_bulls <- bulls_vic |> features(Count, guerrero) |> pull(lambda_guerrero)
lambda_vic <- vic_demand |> features(Demand, guerrero) |> pull(lambda_guerrero)
lambda_gas <- gas_prod |> features(Gas, guerrero) |> pull(lambda_guerrero)
tibble(
series = c("US GDP", "Vic bulls slaughter", "Vic electricity demand", "Gas production"),
lambda = c(lambda_us_gdp, lambda_bulls, lambda_vic, lambda_gas)
)
## # A tibble: 4 × 2
## series lambda
## <chr> <dbl>
## 1 US GDP 0.282
## 2 Vic bulls slaughter -0.0446
## 3 Vic electricity demand 0.0999
## 4 Gas production 0.110
us_gdp |> autoplot(GDP) + labs(title = "US GDP (original)")
us_gdp |> autoplot(box_cox(GDP, lambda_us_gdp)) + labs(title = sprintf("US GDP (Box-Cox, lambda = %.2f)", lambda_us_gdp))
bulls_vic |> autoplot(Count) + labs(title = "Victorian bulls slaughter (original)")
bulls_vic |> autoplot(box_cox(Count, lambda_bulls)) + labs(title = sprintf("Victorian bulls slaughter (Box-Cox, lambda = %.2f)", lambda_bulls))
vic_demand |> autoplot(Demand) + labs(title = "Victorian electricity demand (original)")
vic_demand |> autoplot(box_cox(Demand, lambda_vic)) + labs(title = sprintf("Victorian electricity demand (Box-Cox, lambda = %.2f)", lambda_vic))
gas_prod |> autoplot(Gas) + labs(title = "Gas production (original)")
gas_prod |> autoplot(box_cox(Gas, lambda_gas)) + labs(title = sprintf("Gas production (Box-Cox, lambda = %.2f)", lambda_gas))
Answer: Box-Cox is most useful where variance increases with level (especially GDP and gas). It compresses large values and makes trend/seasonal structure easier to compare over time.
Why is Box-Cox unhelpful for canadian_gas?
Answer: The main issue is changing seasonal pattern over time, not just variance-level relationship. Box-Cox rescales variance but does not fix evolving seasonal shape.
What Box-Cox transformation would you select for your retail data (Exercise 7 in Section 2.10)?
set.seed(67)
myseries <- aus_retail |> filter(`Series ID` == sample(unique(aus_retail$`Series ID`), 1))
lambda_retail <- myseries |> features(Turnover, guerrero) |> pull(lambda_guerrero)
myseries |> distinct(`Series ID`)
## # A tibble: 1 × 1
## `Series ID`
## <chr>
## 1 A3349350R
lambda_retail
## [1] 0.03442716
myseries |> autoplot(Turnover) + labs(title = "Selected retail series (original)")
myseries |> autoplot(box_cox(Turnover, lambda_retail)) + labs(title = sprintf("Selected retail series (Box-Cox, lambda = %.2f)", lambda_retail))
Answer: I use lambda = 0.034 (Guerrero estimate) because it is data-driven and targets variance stabilization.
Find an appropriate Box-Cox transformation for: - Tobacco from
aus_production - Economy class MEL-SYD passengers from
ansett - Southern Cross Station pedestrian counts from
pedestrian
tobacco <- aus_production |> select(Quarter, Tobacco)
ansett_economy <- ansett |> filter(Airports == "MEL-SYD", Class == "Economy") |> select(Week, Passengers)
ped_sc <- pedestrian |> filter(Sensor == "Southern Cross Station") |> select(Date_Time, Count)
lambda_tobacco <- tobacco |> features(Tobacco, guerrero) |> pull(lambda_guerrero)
lambda_ansett <- ansett_economy |> features(Passengers, guerrero) |> pull(lambda_guerrero)
lambda_ped <- ped_sc |> features(Count, guerrero) |> pull(lambda_guerrero)
tibble(
series = c("Tobacco", "Ansett Economy MEL-SYD", "Pedestrian Southern Cross"),
lambda = c(lambda_tobacco, lambda_ansett, lambda_ped)
)
## # A tibble: 3 × 2
## series lambda
## <chr> <dbl>
## 1 Tobacco 0.926
## 2 Ansett Economy MEL-SYD 2.00
## 3 Pedestrian Southern Cross -0.250
tobacco |> autoplot(Tobacco) + labs(title = "Tobacco production (original)")
tobacco |> autoplot(box_cox(Tobacco, lambda_tobacco)) + labs(title = sprintf("Tobacco production (Box-Cox, lambda = %.2f)", lambda_tobacco))
ansett_economy |> autoplot(Passengers) + labs(title = "Ansett Economy MEL-SYD (original)")
ansett_economy |> autoplot(box_cox(Passengers, lambda_ansett)) + labs(title = sprintf("Ansett Economy MEL-SYD (Box-Cox, lambda = %.2f)", lambda_ansett))
ped_sc |> autoplot(Count) + labs(title = "Southern Cross pedestrians (original)")
ped_sc |> autoplot(box_cox(Count, lambda_ped)) + labs(title = sprintf("Southern Cross pedestrians (Box-Cox, lambda = %.2f)", lambda_ped))
Answer: Suggested lambdas are Tobacco 0.926, Ansett 2, and Pedestrian -0.25.
Show that a \(3\times5\) MA is equivalent to a 7-term weighted moving average with weights 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, and 0.067.
w3 <- rep(1/3, 3)
w5 <- rep(1/5, 5)
w35 <- as.numeric(stats::convolve(w3, rev(w5), type = "open"))
tibble(position = -3:3, exact_weight = w35, rounded_3dp = round(w35, 3))
## # A tibble: 7 × 3
## position exact_weight rounded_3dp
## <int> <dbl> <dbl>
## 1 -3 0.0667 0.067
## 2 -2 0.133 0.133
## 3 -1 0.2 0.2
## 4 0 0.2 0.2
## 5 1 0.2 0.2
## 6 2 0.133 0.133
## 7 3 0.0667 0.067
Answer: The combined weights are the convolution \((1/3,1/3,1/3) * (1/5,1/5,1/5,1/5,1/5) = \frac{1}{15}(1,2,3,3,3,2,1)\), which rounds to \((0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067)\).
Use the last five years of gas data from aus_production
and apply classical multiplicative decomposition.
gas_last5 <- tail(aus_production, 20) |> select(Quarter, Gas)
gas_last5 |> autoplot(Gas) + labs(title = "Last 5 years of gas production", x = "Quarter", y = "Gas")
gas_classical <- gas_last5 |>
model(classical_decomposition(Gas, type = "multiplicative")) |>
components()
gas_classical |> autoplot() + labs(title = "Classical multiplicative decomposition: gas (last 5 years)")
gas_classical |> autoplot(season_adjust) + labs(title = "Seasonally adjusted gas (baseline)", y = "Seasonally adjusted Gas")
mid_idx <- round(nrow(gas_last5) / 2)
end_idx <- nrow(gas_last5) - 1
gas_mid_out <- gas_last5 |> mutate(Gas = if_else(row_number() == mid_idx, Gas + 300, Gas))
gas_end_out <- gas_last5 |> mutate(Gas = if_else(row_number() == end_idx, Gas + 300, Gas))
sa_mid <- gas_mid_out |> model(classical_decomposition(Gas, type = "multiplicative")) |> components() |> select(Quarter, sa_mid = season_adjust)
sa_end <- gas_end_out |> model(classical_decomposition(Gas, type = "multiplicative")) |> components() |> select(Quarter, sa_end = season_adjust)
sa_base <- gas_classical |> select(Quarter, sa_base = season_adjust)
autoplot(sa_base, sa_base, colour = "#1b9e77", linewidth = 0.8) +
autolayer(sa_mid, sa_mid, colour = "#d95f02", linewidth = 0.8) +
autolayer(sa_end, sa_end, colour = "#7570b3", linewidth = 0.8) +
labs(
title = "Seasonally adjusted gas under outlier scenarios",
subtitle = "Green: Baseline | Orange: Outlier in middle | Purple: Outlier near end",
x = "Quarter",
y = "Seasonally adjusted Gas"
)
Answer: Outliers distort the seasonally adjusted series. Center outliers usually contaminate more points (two-sided moving averages), while end outliers create more localized edge effects.
Decompose the retail series from Exercise 7 (Section 2.10) using X-11. Identify outliers or unusual features.
x13_available <- requireNamespace("seasonal", quietly = TRUE)
if (x13_available) {
myseries_bc <- myseries |> mutate(Turnover_bc = box_cox(Turnover, lambda_retail))
retail_x11 <- myseries_bc |> model(x11 = X_13ARIMA_SEATS(Turnover_bc ~ x11()))
x11_comp <- retail_x11 |> components()
x11_comp |> autoplot() + labs(title = "X-11 decomposition of selected retail series")
x11_comp |>
mutate(outlier_flag = abs(irregular - median(irregular, na.rm = TRUE)) > 3 * IQR(irregular, na.rm = TRUE)) |>
filter(outlier_flag)
} else {
message("X-11 requires the 'seasonal' package (and X-13 binaries). Install with install.packages('seasonal').")
}
## # A dable: 14 x 10 [1M]
## # Key: State, Industry, .model [1]
## # : Turnover_bc = trend + seasonal + irregular
## State Industry .model Month Turnover_bc trend seasonal irregular
## <chr> <chr> <chr> <mth> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Other retailing x11 1983 Mar 5.39 5.35 -0.0181 0.0627
## 2 Victoria Other retailing x11 1983 Apr 5.21 5.35 -0.0626 -0.0750
## 3 Victoria Other retailing x11 1985 Feb 5.49 5.50 -0.0660 0.0599
## 4 Victoria Other retailing x11 1988 Sep 5.72 5.81 -0.0316 -0.0632
## 5 Victoria Other retailing x11 1988 Dec 6.36 5.84 0.440 0.0783
## 6 Victoria Other retailing x11 1989 Nov 6.01 5.99 0.0770 -0.0642
## 7 Victoria Other retailing x11 1989 Dec 6.52 6.01 0.428 0.0820
## 8 Victoria Other retailing x11 1990 Jan 6.06 6.01 -0.0139 0.0618
## 9 Victoria Other retailing x11 1991 Jul 5.92 5.90 -0.0575 0.0722
## 10 Victoria Other retailing x11 1995 Nov 6.44 6.41 0.0910 -0.0622
## 11 Victoria Other retailing x11 1996 Sep 6.30 6.40 -0.0419 -0.0610
## 12 Victoria Other retailing x11 2000 Jun 6.64 6.63 -0.0697 0.0768
## 13 Victoria Other retailing x11 2011 Dec 8.12 7.60 0.445 0.0690
## 14 Victoria Other retailing x11 2012 Dec 7.91 7.45 0.393 0.0704
## # ℹ 2 more variables: season_adjust <dbl>, outlier_flag <lgl>
Answer: X-11 isolates trend/seasonal/irregular movement. Large spikes in irregular are candidate outliers and are easier to diagnose in component form.
Figures 3.19 and 3.20 (Australian civilian labour force decomposition): 1. Describe the decomposition in 3-5 sentences, with attention to scales. 2. Is the 1991/1992 recession visible?
labour_candidates <- us_employment |>
mutate(title_lower = tolower(Title)) |>
filter(grepl("civilian labor force|civilian labour force", title_lower))
if (nrow(labour_candidates) == 0) {
labour_candidates <- us_employment |> filter(Title == "Total Private")
}
labour_title <- labour_candidates |> count(Title, sort = TRUE) |> slice(1) |> pull(Title)
labour_series <- labour_candidates |>
filter(Title == labour_title) |>
select(Month, Employed)
labour_series |>
autoplot(Employed) +
labs(title = paste("Labour series used:", labour_title), x = "Month", y = "Employed")
labour_decomp <- labour_series |>
model(STL(Employed ~ trend(window = 21) + season(window = "periodic"))) |>
components()
labour_decomp |>
autoplot() +
labs(title = "STL decomposition of labour series")
labour_decomp |>
autoplot(remainder) +
labs(title = "Remainder component", x = "Month", y = "Remainder")
labour_remainder_acf <- labour_decomp |>
ACF(remainder, lag_max = 36)
labour_remainder_acf |>
autoplot() +
labs(title = "ACF of remainder (diagnostic)")
labour_lb <- labour_decomp |>
features(remainder, ljung_box, lag = 24, dof = 0)
labour_lb
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 "STL(Employed ~ trend(window = 21) + season(window = \"peri… 2542. 0
Answer: The trend dominates scale, seasonal movement is smaller, and irregular captures shocks. The early-1990s recession is visible as weaker growth and negative remainder movement. If remainder ACF has significant spikes and Ljung-Box p-values are small, the remainder is not white noise, meaning some systematic structure remains (e.g., evolving seasonality, cycle effects, or structural change).