This R Markdown answers decomposition exercises from: https://otexts.com/fpp3/decomposition-exercises.html

Exercises covered: 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9.

Note on tsibble vs tibble usage:
I keep data as tsibble for time-series tasks (plotting, lagging, modeling, decomposition).
I convert briefly to tibble only for cross-sectional ranking in Exercise 3.1 (group_by(Year), slice_max() across countries), because Year is the tsibble index and tsibble does not allow grouping directly on index variables.

1 Exercise 3.1

Consider GDP in global_economy. Plot GDP per capita over time. Which country has highest GDP per capita? How has this changed over time?

# Remove World Bank aggregate groups so only sovereign-country series remain.
wb_aggregate_codes <- c(
  "ARB","CEB","CSS","EAP","EAS","ECA","ECS","EMU","EUU","FCS","HIC","HPC",
  "IBD","IBT","IDA","IDB","IDX","LAC","LCN","LDC","LIC","LMC","LMY","LTE",
  "MEA","MIC","MNA","NAC","OED","OSS","PRE","PST","SAS","SSA","SSF","SST",
  "TEA","TEC","TLA","TMN","TSA","TSS","UMC","WLD"
)

gdp_pc <- global_economy |>
  filter(!is.na(Code), !Code %in% wb_aggregate_codes) |>
  mutate(GDP_per_capita = GDP / Population) |>
  filter(!is.na(GDP_per_capita))

year_counts <- gdp_pc |>
  as_tibble() |>
  group_by(Year) |>
  summarise(n_countries = n_distinct(Country), .groups = "drop")

latest_year <- year_counts |>
  filter(n_countries >= 10) |>
  summarise(latest = max(Year, na.rm = TRUE)) |>
  pull(latest)

top10_countries <- gdp_pc |>
  as_tibble() |>
  filter(Year == latest_year) |>
  slice_max(order_by = GDP_per_capita, n = 10, with_ties = FALSE) |>
  pull(Country)

best_overall <- gdp_pc |>
  slice_max(order_by = GDP_per_capita, n = 1, with_ties = FALSE) |>
  select(Country, Year, GDP_per_capita)

plot_countries <- union(top10_countries, best_overall$Country)

gdp_pc_top <- gdp_pc |>
  filter(Country %in% plot_countries)

country_pop_ref <- gdp_pc |>
  as_tibble() |>
  filter(Country %in% plot_countries) |>
  group_by(Country) |>
  filter(Year == max(Year, na.rm = TRUE)) |>
  slice_tail(n = 1) |>
  ungroup() |>
  select(Country, Population)

pop_cut <- median(country_pop_ref$Population, na.rm = TRUE)

country_style <- country_pop_ref |>
  arrange(desc(Population), Country) |>
  mutate(
    pop_group = if_else(Population >= pop_cut, "Large population", "Lower population"),
    small_rank = cumsum(pop_group == "Lower population"),
    line_style = case_when(
      pop_group == "Large population" ~ "solid",
      small_rank %% 2 == 1 ~ "dashed",
      TRUE ~ "dotted"
    )
  ) |>
  select(Country, pop_group, line_style)

gdp_pc_top <- gdp_pc_top |>
  left_join(country_style, by = "Country")

# Second chart subset: industrialized nations (OECD members) with
# latest observed population >= 10 million.
oecd_codes <- c(
  "AUS","AUT","BEL","CAN","CHL","COL","CRI","CZE","DNK","EST","FIN","FRA",
  "DEU","GRC","HUN","ISL","IRL","ISR","ITA","JPN","KOR","LVA","LTU","LUX",
  "MEX","NLD","NZL","NOR","POL","PRT","SVK","SVN","ESP","SWE","CHE","TUR",
  "GBR","USA"
)

country_latest_pop <- gdp_pc |>
  as_tibble() |>
  group_by(Country, Code) |>
  filter(Year == max(Year, na.rm = TRUE)) |>
  slice_tail(n = 1) |>
  ungroup() |>
  select(Country, Code, Population)

industrialized_10m_countries <- country_latest_pop |>
  filter(Code %in% oecd_codes, Population >= 1e7) |>
  pull(Country)

gdp_pc_industrialized_10m <- gdp_pc |>
  filter(Country %in% industrialized_10m_countries)

ind_top_pop_ref <- gdp_pc_industrialized_10m |>
  as_tibble() |>
  group_by(Country) |>
  filter(Year == max(Year, na.rm = TRUE)) |>
  slice_tail(n = 1) |>
  ungroup() |>
  select(Country, Population)

ind_pop_cut <- median(ind_top_pop_ref$Population, na.rm = TRUE)

ind_style <- ind_top_pop_ref |>
  arrange(desc(Population), Country) |>
  mutate(
    pop_group = if_else(Population >= ind_pop_cut, "Large population", "Lower population"),
    small_rank = cumsum(pop_group == "Lower population"),
    line_style = case_when(
      pop_group == "Large population" ~ "solid",
      small_rank %% 2 == 1 ~ "dashed",
      TRUE ~ "dotted"
    )
  ) |>
  select(Country, line_style)

gdp_pc_industrialized_10m_all <- gdp_pc_industrialized_10m |>
  left_join(ind_style, by = "Country")

best_overall
## # A tsibble: 1 x 3 [1Y]
## # Key:       Country [1]
##   Country  Year GDP_per_capita
##   <fct>   <dbl>          <dbl>
## 1 Monaco   2014        185153.
gdp_pc_top |>
  distinct(Country, pop_group, line_style) |>
  arrange(desc(pop_group), Country)
## # A tibble: 11 × 3
##    Country          pop_group        line_style
##    <fct>            <chr>            <chr>     
##  1 Iceland          Lower population dotted    
##  2 Luxembourg       Lower population dashed    
##  3 Macao SAR, China Lower population dotted    
##  4 Monaco           Lower population dashed    
##  5 Qatar            Lower population dashed    
##  6 Denmark          Large population solid     
##  7 Ireland          Large population solid     
##  8 Norway           Large population solid     
##  9 Singapore        Large population solid     
## 10 Switzerland      Large population solid     
## 11 United States    Large population solid
gdp_pc_top |>
  ggplot(aes(x = Year, y = GDP_per_capita, colour = Country, linetype = line_style)) +
  geom_line(linewidth = 0.8, alpha = 0.95) +
  scale_linetype_manual(
    values = c("solid" = "solid", "dashed" = "dashed", "dotted" = "dotted"),
    breaks = c("solid", "dashed", "dotted"),
    labels = c("Larger population", "Lower population (dashed)", "Lower population (dotted)")
  ) +
  labs(
    title = "GDP per capita over time (top countries)",
    subtitle = "Solid lines are larger-population countries; lower-population countries use dashed/dotted lines",
    y = "GDP per capita",
    x = "Year"
  ) +
  theme(
    panel.background = element_rect(fill = "white", colour = NA),
    plot.background = element_rect(fill = "white", colour = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.spacing.x = unit(0.08, "cm"),
    legend.spacing.y = unit(0.02, "cm"),
    legend.key.height = unit(0.28, "cm"),
    legend.key.width = unit(0.7, "cm")
  )

gdp_pc_industrialized_10m_all |>
  ggplot(aes(x = Year, y = GDP_per_capita, colour = Country, linetype = line_style)) +
  geom_line(linewidth = 0.75, alpha = 0.95) +
  scale_linetype_manual(
    values = c("solid" = "solid", "dashed" = "dashed", "dotted" = "dotted"),
    breaks = c("solid", "dashed", "dotted"),
    labels = c("Larger population", "Lower population (dashed)", "Lower population (dotted)")
  ) +
  labs(
    title = "GDP per capita: industrialized countries (population >= 10M)",
    subtitle = "Solid lines are larger-population countries; OECD member proxy; includes all qualifying countries (including US)",
    y = "GDP per capita",
    x = "Year"
  ) +
  theme(
    panel.background = element_rect(fill = "white", colour = NA),
    plot.background = element_rect(fill = "white", colour = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.spacing.x = unit(0.08, "cm"),
    legend.spacing.y = unit(0.02, "cm"),
    legend.key.height = unit(0.28, "cm"),
    legend.key.width = unit(0.7, "cm")
  )

Answer: The single highest country-year value is Monaco in 2014. This exercise now excludes regional/aggregate entries (World Bank aggregate codes), so the chart contains only sovereign-country series. To keep the chart readable, I use the top countries in the latest year with enough observations and include the overall maximum country if needed. I also encode population size in line type: larger-population countries are solid, lower-population countries are dashed/dotted.

2 Exercise 3.2

For each series, graph the data. If transforming is appropriate, transform and describe the effect.

us_gdp <- global_economy |> filter(Country == "United States") |> select(Year, GDP)
bulls_vic <- aus_livestock |> filter(Animal == "Bulls, bullocks and steers", State == "Victoria") |> select(Month, Count)
vic_demand <- vic_elec |> select(Time, Demand)
gas_prod <- aus_production |> select(Quarter, Gas)

lambda_us_gdp <- us_gdp |> features(GDP, guerrero) |> pull(lambda_guerrero)
lambda_bulls  <- bulls_vic |> features(Count, guerrero) |> pull(lambda_guerrero)
lambda_vic    <- vic_demand |> features(Demand, guerrero) |> pull(lambda_guerrero)
lambda_gas    <- gas_prod |> features(Gas, guerrero) |> pull(lambda_guerrero)

tibble(
  series = c("US GDP", "Vic bulls slaughter", "Vic electricity demand", "Gas production"),
  lambda = c(lambda_us_gdp, lambda_bulls, lambda_vic, lambda_gas)
)
## # A tibble: 4 × 2
##   series                  lambda
##   <chr>                    <dbl>
## 1 US GDP                  0.282 
## 2 Vic bulls slaughter    -0.0446
## 3 Vic electricity demand  0.0999
## 4 Gas production          0.110
us_gdp |> autoplot(GDP) + labs(title = "US GDP (original)")

us_gdp |> autoplot(box_cox(GDP, lambda_us_gdp)) + labs(title = sprintf("US GDP (Box-Cox, lambda = %.2f)", lambda_us_gdp))

bulls_vic |> autoplot(Count) + labs(title = "Victorian bulls slaughter (original)")

bulls_vic |> autoplot(box_cox(Count, lambda_bulls)) + labs(title = sprintf("Victorian bulls slaughter (Box-Cox, lambda = %.2f)", lambda_bulls))

vic_demand |> autoplot(Demand) + labs(title = "Victorian electricity demand (original)")

vic_demand |> autoplot(box_cox(Demand, lambda_vic)) + labs(title = sprintf("Victorian electricity demand (Box-Cox, lambda = %.2f)", lambda_vic))

gas_prod |> autoplot(Gas) + labs(title = "Gas production (original)")

gas_prod |> autoplot(box_cox(Gas, lambda_gas)) + labs(title = sprintf("Gas production (Box-Cox, lambda = %.2f)", lambda_gas))

Answer: Box-Cox is most useful where variance increases with level (especially GDP and gas). It compresses large values and makes trend/seasonal structure easier to compare over time.

3 Exercise 3.3

Why is Box-Cox unhelpful for canadian_gas?

Answer: The main issue is changing seasonal pattern over time, not just variance-level relationship. Box-Cox rescales variance but does not fix evolving seasonal shape.

4 Exercise 3.4

What Box-Cox transformation would you select for your retail data (Exercise 7 in Section 2.10)?

set.seed(67)
myseries <- aus_retail |> filter(`Series ID` == sample(unique(aus_retail$`Series ID`), 1))

lambda_retail <- myseries |> features(Turnover, guerrero) |> pull(lambda_guerrero)

myseries |> distinct(`Series ID`)
## # A tibble: 1 × 1
##   `Series ID`
##   <chr>      
## 1 A3349350R
lambda_retail
## [1] 0.03442716
myseries |> autoplot(Turnover) + labs(title = "Selected retail series (original)")

myseries |> autoplot(box_cox(Turnover, lambda_retail)) + labs(title = sprintf("Selected retail series (Box-Cox, lambda = %.2f)", lambda_retail))

Answer: I use lambda = 0.034 (Guerrero estimate) because it is data-driven and targets variance stabilization.

5 Exercise 3.5

Find an appropriate Box-Cox transformation for: - Tobacco from aus_production - Economy class MEL-SYD passengers from ansett - Southern Cross Station pedestrian counts from pedestrian

tobacco <- aus_production |> select(Quarter, Tobacco)
ansett_economy <- ansett |> filter(Airports == "MEL-SYD", Class == "Economy") |> select(Week, Passengers)
ped_sc <- pedestrian |> filter(Sensor == "Southern Cross Station") |> select(Date_Time, Count)

lambda_tobacco <- tobacco |> features(Tobacco, guerrero) |> pull(lambda_guerrero)
lambda_ansett <- ansett_economy |> features(Passengers, guerrero) |> pull(lambda_guerrero)
lambda_ped <- ped_sc |> features(Count, guerrero) |> pull(lambda_guerrero)

tibble(
  series = c("Tobacco", "Ansett Economy MEL-SYD", "Pedestrian Southern Cross"),
  lambda = c(lambda_tobacco, lambda_ansett, lambda_ped)
)
## # A tibble: 3 × 2
##   series                    lambda
##   <chr>                      <dbl>
## 1 Tobacco                    0.926
## 2 Ansett Economy MEL-SYD     2.00 
## 3 Pedestrian Southern Cross -0.250
tobacco |> autoplot(Tobacco) + labs(title = "Tobacco production (original)")

tobacco |> autoplot(box_cox(Tobacco, lambda_tobacco)) + labs(title = sprintf("Tobacco production (Box-Cox, lambda = %.2f)", lambda_tobacco))

ansett_economy |> autoplot(Passengers) + labs(title = "Ansett Economy MEL-SYD (original)")

ansett_economy |> autoplot(box_cox(Passengers, lambda_ansett)) + labs(title = sprintf("Ansett Economy MEL-SYD (Box-Cox, lambda = %.2f)", lambda_ansett))

ped_sc |> autoplot(Count) + labs(title = "Southern Cross pedestrians (original)")

ped_sc |> autoplot(box_cox(Count, lambda_ped)) + labs(title = sprintf("Southern Cross pedestrians (Box-Cox, lambda = %.2f)", lambda_ped))

Answer: Suggested lambdas are Tobacco 0.926, Ansett 2, and Pedestrian -0.25.

6 Exercise 3.6

Show that a \(3\times5\) MA is equivalent to a 7-term weighted moving average with weights 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, and 0.067.

w3 <- rep(1/3, 3)
w5 <- rep(1/5, 5)
w35 <- as.numeric(stats::convolve(w3, rev(w5), type = "open"))

tibble(position = -3:3, exact_weight = w35, rounded_3dp = round(w35, 3))
## # A tibble: 7 × 3
##   position exact_weight rounded_3dp
##      <int>        <dbl>       <dbl>
## 1       -3       0.0667       0.067
## 2       -2       0.133        0.133
## 3       -1       0.2          0.2  
## 4        0       0.2          0.2  
## 5        1       0.2          0.2  
## 6        2       0.133        0.133
## 7        3       0.0667       0.067

Answer: The combined weights are the convolution \((1/3,1/3,1/3) * (1/5,1/5,1/5,1/5,1/5) = \frac{1}{15}(1,2,3,3,3,2,1)\), which rounds to \((0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067)\).

7 Exercise 3.7

Use the last five years of gas data from aus_production and apply classical multiplicative decomposition.

gas_last5 <- tail(aus_production, 20) |> select(Quarter, Gas)

gas_last5 |> autoplot(Gas) + labs(title = "Last 5 years of gas production", x = "Quarter", y = "Gas")

gas_classical <- gas_last5 |>
  model(classical_decomposition(Gas, type = "multiplicative")) |>
  components()

gas_classical |> autoplot() + labs(title = "Classical multiplicative decomposition: gas (last 5 years)")

gas_classical |> autoplot(season_adjust) + labs(title = "Seasonally adjusted gas (baseline)", y = "Seasonally adjusted Gas")

mid_idx <- round(nrow(gas_last5) / 2)
end_idx <- nrow(gas_last5) - 1

gas_mid_out <- gas_last5 |> mutate(Gas = if_else(row_number() == mid_idx, Gas + 300, Gas))
gas_end_out <- gas_last5 |> mutate(Gas = if_else(row_number() == end_idx, Gas + 300, Gas))

sa_mid <- gas_mid_out |> model(classical_decomposition(Gas, type = "multiplicative")) |> components() |> select(Quarter, sa_mid = season_adjust)
sa_end <- gas_end_out |> model(classical_decomposition(Gas, type = "multiplicative")) |> components() |> select(Quarter, sa_end = season_adjust)
sa_base <- gas_classical |> select(Quarter, sa_base = season_adjust)

autoplot(sa_base, sa_base, colour = "#1b9e77", linewidth = 0.8) +
  autolayer(sa_mid, sa_mid, colour = "#d95f02", linewidth = 0.8) +
  autolayer(sa_end, sa_end, colour = "#7570b3", linewidth = 0.8) +
  labs(
    title = "Seasonally adjusted gas under outlier scenarios",
    subtitle = "Green: Baseline | Orange: Outlier in middle | Purple: Outlier near end",
    x = "Quarter",
    y = "Seasonally adjusted Gas"
  )

Answer: Outliers distort the seasonally adjusted series. Center outliers usually contaminate more points (two-sided moving averages), while end outliers create more localized edge effects.

8 Exercise 3.8

Decompose the retail series from Exercise 7 (Section 2.10) using X-11. Identify outliers or unusual features.

x13_available <- requireNamespace("seasonal", quietly = TRUE)

if (x13_available) {
  myseries_bc <- myseries |> mutate(Turnover_bc = box_cox(Turnover, lambda_retail))
  retail_x11 <- myseries_bc |> model(x11 = X_13ARIMA_SEATS(Turnover_bc ~ x11()))
  x11_comp <- retail_x11 |> components()

  x11_comp |> autoplot() + labs(title = "X-11 decomposition of selected retail series")

  x11_comp |>
    mutate(outlier_flag = abs(irregular - median(irregular, na.rm = TRUE)) > 3 * IQR(irregular, na.rm = TRUE)) |>
    filter(outlier_flag)
} else {
  message("X-11 requires the 'seasonal' package (and X-13 binaries). Install with install.packages('seasonal').")
}
## # A dable: 14 x 10 [1M]
## # Key:     State, Industry, .model [1]
## # :        Turnover_bc = trend + seasonal + irregular
##    State    Industry        .model    Month Turnover_bc trend seasonal irregular
##    <chr>    <chr>           <chr>     <mth>       <dbl> <dbl>    <dbl>     <dbl>
##  1 Victoria Other retailing x11    1983 Mar        5.39  5.35  -0.0181    0.0627
##  2 Victoria Other retailing x11    1983 Apr        5.21  5.35  -0.0626   -0.0750
##  3 Victoria Other retailing x11    1985 Feb        5.49  5.50  -0.0660    0.0599
##  4 Victoria Other retailing x11    1988 Sep        5.72  5.81  -0.0316   -0.0632
##  5 Victoria Other retailing x11    1988 Dec        6.36  5.84   0.440     0.0783
##  6 Victoria Other retailing x11    1989 Nov        6.01  5.99   0.0770   -0.0642
##  7 Victoria Other retailing x11    1989 Dec        6.52  6.01   0.428     0.0820
##  8 Victoria Other retailing x11    1990 Jan        6.06  6.01  -0.0139    0.0618
##  9 Victoria Other retailing x11    1991 Jul        5.92  5.90  -0.0575    0.0722
## 10 Victoria Other retailing x11    1995 Nov        6.44  6.41   0.0910   -0.0622
## 11 Victoria Other retailing x11    1996 Sep        6.30  6.40  -0.0419   -0.0610
## 12 Victoria Other retailing x11    2000 Jun        6.64  6.63  -0.0697    0.0768
## 13 Victoria Other retailing x11    2011 Dec        8.12  7.60   0.445     0.0690
## 14 Victoria Other retailing x11    2012 Dec        7.91  7.45   0.393     0.0704
## # ℹ 2 more variables: season_adjust <dbl>, outlier_flag <lgl>

Answer: X-11 isolates trend/seasonal/irregular movement. Large spikes in irregular are candidate outliers and are easier to diagnose in component form.

9 Exercise 3.9

Figures 3.19 and 3.20 (Australian civilian labour force decomposition): 1. Describe the decomposition in 3-5 sentences, with attention to scales. 2. Is the 1991/1992 recession visible?

labour_candidates <- us_employment |>
  mutate(title_lower = tolower(Title)) |>
  filter(grepl("civilian labor force|civilian labour force", title_lower))

if (nrow(labour_candidates) == 0) {
  labour_candidates <- us_employment |> filter(Title == "Total Private")
}

labour_title <- labour_candidates |> count(Title, sort = TRUE) |> slice(1) |> pull(Title)

labour_series <- labour_candidates |>
  filter(Title == labour_title) |>
  select(Month, Employed)

labour_series |>
  autoplot(Employed) +
  labs(title = paste("Labour series used:", labour_title), x = "Month", y = "Employed")

labour_decomp <- labour_series |>
  model(STL(Employed ~ trend(window = 21) + season(window = "periodic"))) |>
  components()

labour_decomp |>
  autoplot() +
  labs(title = "STL decomposition of labour series")

labour_decomp |>
  autoplot(remainder) +
  labs(title = "Remainder component", x = "Month", y = "Remainder")

labour_remainder_acf <- labour_decomp |>
  ACF(remainder, lag_max = 36)

labour_remainder_acf |>
  autoplot() +
  labs(title = "ACF of remainder (diagnostic)")

labour_lb <- labour_decomp |>
  features(remainder, ljung_box, lag = 24, dof = 0)

labour_lb
## # A tibble: 1 × 3
##   .model                                                       lb_stat lb_pvalue
##   <chr>                                                          <dbl>     <dbl>
## 1 "STL(Employed ~ trend(window = 21) + season(window = \"peri…   2542.         0

Answer: The trend dominates scale, seasonal movement is smaller, and irregular captures shocks. The early-1990s recession is visible as weaker growth and negative remainder movement. If remainder ACF has significant spikes and Ljung-Box p-values are small, the remainder is not white noise, meaning some systematic structure remains (e.g., evolving seasonality, cycle effects, or structural change).