Problem Set

knitr::opts_chunk$set(echo = TRUE)
# Core packages
library(fpp3)      # tsibble, fable, feasts, distributional, ggplot2 (via fpp3)

## Warning: package 'fpp3' was built under R version 4.4.3

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.3     ✔ fable       0.4.1
## ✔ ggplot2     3.5.1

## Warning: package 'tsibble' was built under R version 4.4.3

## Warning: package 'tsibbledata' was built under R version 4.4.3

## Warning: package 'feasts' was built under R version 4.4.3

## Warning: package 'fabletools' was built under R version 4.4.3

## Warning: package 'fable' was built under R version 4.4.3

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(ggplot2)   # explicit, but fpp3 also attaches it

# Problem 1
## 1(a): Use help() interactively (commented here so knitting won't choke)
# help(aus_production)
# help(pelt)
# help(gafa_stock)
# help(vic_elec)

## 1(b): Time interval of each series
interval(aus_production)    # Bricks series interval

## <interval[1]>
## [1] 1Q

interval(pelt)              # Lynx/Hare series interval

## <interval[1]>
## [1] 1Y

interval(gafa_stock)        # Close price series interval

## <interval[1]>
## [1] !

interval(vic_elec)          # Demand series interval

## <interval[1]>
## [1] 30m

## 1(c) & 1(d): Plot each series

# Bricks from aus_production
autoplot(aus_production, Bricks) +
  ggtitle("Australian Brick Production") +
  ylab("Millions of Units") + xlab("Quarter")

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Lynx from pelt
autoplot(pelt, Lynx) +
  ggtitle("Annual Lynx Pelts (Canada)") +
  ylab("Number of Pelts") + xlab("Year")

# Close from gafa_stock
autoplot(gafa_stock, Close) +
  ggtitle("GAFA Stock Closing Prices") +
  ylab("Stock Price (USD)") + xlab("Date")

# Demand from vic_elec (customized axes and title for part d)
autoplot(vic_elec, Demand) +
  ggtitle("Electricity Demand in Victoria (Australia)") +
  ylab("Electricity Demand (MW)") + xlab("Year")

# Problem 2
# Date of peak closing price for each stock in gafa_stock (break ties consistently)
gafa_stock |>
  group_by(Symbol) |>
  slice_max(Close, n = 1, with_ties = FALSE) |>
  ungroup()

## # A tsibble: 4 x 8 [!]
## # Key:       Symbol [4]
##   Symbol Date        Open  High   Low Close Adj_Close   Volume
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1 AAPL   2018-10-03  230.  233.  230.  232.      230. 28654800
## 2 AMZN   2018-09-04 2026. 2050. 2013  2040.     2040.  5721100
## 3 FB     2018-07-25  216.  219.  214.  218.      218. 58954200
## 4 GOOG   2018-07-26 1251  1270. 1249. 1268.     1268.  2405600

# Problem 3
## 3(a): Compare arrivals from Japan, NZ, UK, US using plots

# Time series plots
autoplot(aus_arrivals, Arrivals) +
  labs(y = "Arrivals (thousands)",
       title = "Quarterly Arrivals to Australia (Japan, NZ, UK, US)")

# Seasonal plots
gg_season(aus_arrivals, Arrivals) +
  labs(y = "Arrivals (thousands)",
       title = "Seasonal Plot: International Arrivals by Country")

# Subseries plots
gg_subseries(aus_arrivals, Arrivals) +
  labs(y = "Arrivals (thousands)",
       title = "Subseries Plot: International Arrivals by Country")

# Problem 4
## 4.1 Total Private Employed (us_employment)
total_private <- us_employment |>
  filter(Title == "Total Private")

autoplot(total_private, Employed) +
  labs(title = "Total Private Employment (US)", y = "Number of Employees", x = "Year")

gg_season(total_private, Employed) +
  labs(title = "Seasonal Plot: Total Private Employment", y = "Employees")

gg_subseries(total_private, Employed) +
  labs(title = "Subseries Plot: Total Private Employment", y = "Employees")

gg_lag(total_private, Employed, geom = "point") +
  labs(title = "Lag Plot: Total Private Employment")

total_private |>
  ACF(Employed) |>
  autoplot() +
  labs(title = "ACF: Total Private Employment")

## 4.2 Bricks (aus_production)
bricks_ts <- aus_production |>
  filter(!is.na(Bricks))

autoplot(bricks_ts, Bricks) +
  labs(title = "Australian Clay Brick Production", y = "Millions of Units", x = "Year")

gg_season(bricks_ts, Bricks) +
  labs(title = "Seasonal Plot: Brick Production", y = "Millions of Units")

gg_subseries(bricks_ts, Bricks) +
  labs(title = "Subseries Plot: Brick Production", y = "Millions of Units")

gg_lag(bricks_ts, Bricks, geom = "point") +
  labs(title = "Lag Plot: Brick Production")

bricks_ts |>
  ACF(Bricks) |>
  autoplot() +
  labs(title = "ACF: Australian Brick Production")

## 4.3 Hare (pelt)
autoplot(pelt, Hare) +
  labs(title = "Hare Pelts Traded (Canada)", y = "Number of Pelts", x = "Year")

# (Seasonal and subseries plots are less informative for annual data)
gg_lag(pelt, Hare, geom = "point") +
  labs(title = "Lag Plot: Hare Pelts")

pelt |>
  ACF(Hare) |>
  autoplot() +
  labs(title = "ACF: Hare Pelts")

## 4.4 H02 Cost (PBS) — fixed
# Monthly totals (no index_by overwrite)
h02_monthly <- PBS |>
  filter(ATC2 == "H02") |>
  summarise(Cost = sum(Cost))
# Use a short alias for downstream plots
h02 <- h02_monthly

autoplot(h02, Cost) +
  labs(title = "Monthly Cost of H02 (Corticosteroids)", y = "Total Cost", x = "Year")

# Yearly totals (collapse months safely with a *new* index)
h02_yearly <- PBS |>
  filter(ATC2 == "H02") |>
  index_by(Year = year(Month)) |>
  summarise(Cost = sum(Cost))

autoplot(h02_yearly, Cost) +
  labs(title = "Annual Cost of H02 (Corticosteroids)", y = "Total Cost", x = "Year")

# Seasonality / subseries / lag / ACF on monthly data
gg_season(h02, Cost) +
  labs(title = "Seasonal Plot: H02 Cost", y = "Cost (AUD)")

gg_subseries(h02, Cost) +
  labs(title = "Subseries Plot: H02 Cost", y = "Cost (AUD)")

gg_lag(h02, Cost, geom = "point") +
  labs(title = "Lag Plot: H02 Cost")

h02 |>
  ACF(Cost) |>
  autoplot() +
  labs(title = "ACF: H02 Cost")

## 4.5 Barrels (us_gasoline)
autoplot(us_gasoline, Barrels) +
  labs(title = "US Finished Motor Gasoline Supplied",
       y = "Million Barrels/Day", x = "Year")

gg_season(us_gasoline, Barrels) +
  labs(title = "Seasonal Plot: US Gasoline Supply", y = "Million Barrels/Day")

gg_subseries(us_gasoline, Barrels) +
  labs(title = "Subseries Plot: US Gasoline Supply", y = "Million Barrels/Day")

gg_lag(us_gasoline, Barrels, geom = "point") +
  labs(title = "Lag Plot: US Gasoline Supply")

us_gasoline |>
  ACF(Barrels) |>
  autoplot() +
  labs(title = "ACF: US Gasoline Supply")

# Problem 5

## 5(a): Prepare January 2014 daily data and plot it; fit regression model
jan14_vic_elec <- vic_elec |>
  filter(yearmonth(Time) == yearmonth("2014 Jan")) |>
  index_by(Date = as_date(Time)) |>
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )

# Show the tibble
print(jan14_vic_elec)

## # A tsibble: 31 x 3 [1D]
##    Date        Demand Temperature
##    <date>       <dbl>       <dbl>
##  1 2014-01-01 175185.        26  
##  2 2014-01-02 188351.        23  
##  3 2014-01-03 189086.        22.2
##  4 2014-01-04 173798.        20.3
##  5 2014-01-05 169733.        26.1
##  6 2014-01-06 195241.        19.6
##  7 2014-01-07 199770.        20  
##  8 2014-01-08 205339.        27.4
##  9 2014-01-09 227334.        32.4
## 10 2014-01-10 258111.        34  
## # ℹ 21 more rows

# Plot daily Demand and Temperature over time (faceted)
jan14_vic_elec |>
  pivot_longer(cols = c(Demand, Temperature), names_to = "Variable", values_to = "Value") |>
  ggplot(aes(x = Date, y = Value)) +
  geom_line() +
  facet_grid(rows = vars(Variable), scales = "free_y") +
  labs(title = "Jan 2014 Electricity Demand and Temperature", x = "Date", y = NULL)

# Scatter plot of Demand vs Temperature (Jan 2014)
ggplot(jan14_vic_elec, aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Demand vs Temperature (Jan 2014)", x = "Max Temperature (°C)", y = "Daily Demand")

## `geom_smooth()` using formula = 'y ~ x'

## 5(b): Fit linear model and plot residuals
fit <- jan14_vic_elec |>
  model(TSLM(Demand ~ Temperature))

fit |>
  gg_tsresiduals()

## 5(c): Forecast next day demand for Temperature = 15°C vs 35°C
fc_15 <- fit |>
  forecast(new_data(jan14_vic_elec, 1) |> mutate(Temperature = 15))

fc_35 <- fit |>
  forecast(new_data(jan14_vic_elec, 1) |> mutate(Temperature = 35))

autoplot(fc_15, jan14_vic_elec) +
  labs(title = "Forecast with Temperature = 15°C", y = "Demand", x = "Date")

autoplot(fc_35, jan14_vic_elec) +
  labs(title = "Forecast with Temperature = 35°C", y = "Demand", x = "Date")

## 5(d): Show prediction intervals for the two forecasts
fc_tbl <- dplyr::bind_rows(
  as_tibble(hilo(fc_15, 80, 95)) |> mutate(Scenario = "Temp = 15°C"),
  as_tibble(hilo(fc_35, 80, 95)) |> mutate(Scenario = "Temp = 35°C")
)

# Expand the 80% and 95% interval columns into lower/upper bounds
## 5(d): Show prediction intervals for the two forecasts

fc_tbl <- dplyr::bind_rows(
  as_tibble(hilo(fc_15, c(80, 95))) |> mutate(Scenario = "Temp = 15°C"),
  as_tibble(hilo(fc_35, c(80, 95))) |> mutate(Scenario = "Temp = 35°C")
)

fc_clean <- fc_tbl |>
  fabletools::unpack_hilo(`80%`) |>
  fabletools::unpack_hilo(`95%`) |>
  select(Scenario, .mean, `80%_lower`, `80%_upper`, `95%_lower`, `95%_upper`)

print(fc_clean)

## # A tibble: 2 × 6
##   Scenario      .mean `80%_lower` `80%_upper` `95%_lower` `95%_upper`
##   <chr>         <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
## 1 Temp = 15°C 151398.     117908.     184889.     100179.     202617.
## 2 Temp = 35°C 274484.     242088.     306880.     224939.     324029.

## 5(e): Plot Demand vs Temperature for all available data (daily aggregates)
vic_elec_daily <- vic_elec |>
  index_by(Date = as_date(Time)) |>
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )

# Show the tibble
print(vic_elec_daily)

## # A tsibble: 1,096 x 3 [1D]
##    Date        Demand Temperature
##    <date>       <dbl>       <dbl>
##  1 2012-01-01 222438.        32.7
##  2 2012-01-02 257965.        39.6
##  3 2012-01-03 267099.        31.8
##  4 2012-01-04 222742.        25.1
##  5 2012-01-05 210585.        21.2
##  6 2012-01-06 210247.        23.6
##  7 2012-01-07 202526.        29  
##  8 2012-01-08 193413.        27.8
##  9 2012-01-09 213804.        24  
## 10 2012-01-10 215020.        19.6
## # ℹ 1,086 more rows

# Scatter plot
ggplot(vic_elec_daily, aes(x = Temperature, y = Demand)) +
  geom_point() +
  labs(title = "Daily Demand vs Temperature (All Data)",
       x = "Max Temperature (°C)", y = "Daily Demand")

# Problem 6
## 6(a): Time plot of monthly sales
autoplot(souvenirs, Sales) +
  labs(title = "Souvenir Shop Monthly Sales", y = "Sales", x = "Year")

## 6(b): Note — Taking logs is helpful for multiplicative seasonality / variance

## 6(c): Regression on log sales with trend, seasonal dummies, and festival dummy
souvenirs <- souvenirs |>
  mutate(Festival = if_else(year(Month) >= 1988 & month(Month) == 3, 1, 0))

souvenirs_mod <- souvenirs |>
  model(TSLM(log(Sales) ~ trend() + season() + Festival))

## 6(d): Residuals vs time and vs fitted
aug <- augment(souvenirs_mod)

# Residuals over time
ggplot(aug, aes(x = Month, y = .resid)) +
  geom_line() +
  labs(title = "Residuals vs Time", x = "Year", y = "Residuals")

# Residuals vs fitted
ggplot(aug, aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(title = "Residuals vs Fitted", x = "Fitted values", y = "Residuals")

## 6(e): Boxplots of residuals by month
aug <- aug |>
  mutate(MonthFac = factor(month(Month), labels = month.abb))

ggplot(aug, aes(x = MonthFac, y = .resid)) +
  geom_boxplot() +
  labs(title = "Residuals by Month", x = "Month", y = "Residuals")

## 6(f): Model coefficients
tidy(souvenirs_mod)

## # A tibble: 14 × 6
##    .model                            term  estimate std.error statistic  p.value
##    <chr>                             <chr>    <dbl>     <dbl>     <dbl>    <dbl>
##  1 TSLM(log(Sales) ~ trend() + seas… (Int…   7.62    0.0742      103.   4.67e-78
##  2 TSLM(log(Sales) ~ trend() + seas… tren…   0.0220  0.000827     26.6  2.32e-38
##  3 TSLM(log(Sales) ~ trend() + seas… seas…   0.251   0.0957        2.63 1.06e- 2
##  4 TSLM(log(Sales) ~ trend() + seas… seas…   0.266   0.193         1.38 1.73e- 1
##  5 TSLM(log(Sales) ~ trend() + seas… seas…   0.384   0.0957        4.01 1.48e- 4
##  6 TSLM(log(Sales) ~ trend() + seas… seas…   0.409   0.0957        4.28 5.88e- 5
##  7 TSLM(log(Sales) ~ trend() + seas… seas…   0.449   0.0958        4.69 1.33e- 5
##  8 TSLM(log(Sales) ~ trend() + seas… seas…   0.610   0.0958        6.37 1.71e- 8
##  9 TSLM(log(Sales) ~ trend() + seas… seas…   0.588   0.0959        6.13 4.53e- 8
## 10 TSLM(log(Sales) ~ trend() + seas… seas…   0.669   0.0959        6.98 1.36e- 9
## 11 TSLM(log(Sales) ~ trend() + seas… seas…   0.747   0.0960        7.79 4.48e-11
## 12 TSLM(log(Sales) ~ trend() + seas… seas…   1.21    0.0960       12.6  1.29e-19
## 13 TSLM(log(Sales) ~ trend() + seas… seas…   1.96    0.0961       20.4  3.39e-31
## 14 TSLM(log(Sales) ~ trend() + seas… Fest…   0.502   0.196         2.55 1.29e- 2

## 6(g): Ljung-Box test on residuals (24 lags; dof = number of regressors incl. intercept - 1)
aug |>
  features(.resid, ljung_box, lag = 24, dof = 14)

## # A tibble: 1 × 3
##   .model                                           lb_stat lb_pvalue
##   <chr>                                              <dbl>     <dbl>
## 1 TSLM(log(Sales) ~ trend() + season() + Festival)    135.         0

## 6(h) & 6(i): Forecast monthly sales for 1994–1996 with prediction intervals
future <- new_data(souvenirs, 36) |>
  mutate(Festival = if_else(year(Month) >= 1988 & month(Month) == 3, 1, 0))

fc <- souvenirs_mod |>
  forecast(new_data = future)

autoplot(fc, souvenirs) +
  labs(title = "Forecasts for 1994–1996", y = "Sales", x = "Year")

## 6(j): Improve predictions by adding an ARIMA error term (dynamic regression)
souvenirs_mod2 <- souvenirs |>
  model(ARIMA(log(Sales) ~ trend() + season() + Festival))

report(souvenirs_mod2)

## Series: Sales 
## Model: LM w/ ARIMA(1,0,2) errors 
## Transformation: log(Sales) 
## 
## Coefficients:
##          ar1      ma1     ma2  trend()  season()year2  season()year3
##       0.5565  -0.1288  0.3404   0.0226         0.2516         0.2969
## s.e.  0.1787   0.1923  0.1139   0.0015         0.0574         0.1185
##       season()year4  season()year5  season()year6  season()year7  season()year8
##              0.3769         0.4003         0.4379         0.5978         0.5732
## s.e.         0.0729         0.0789         0.0817         0.0827         0.0821
##       season()year9  season()year10  season()year11  season()year12  Festival
##              0.6514          0.7245          1.1755          1.9274    0.4605
## s.e.         0.0799          0.0746          0.0629          0.0599    0.1192
##       intercept
##          7.6040
## s.e.     0.0857
## 
## sigma^2 estimated as 0.02022:  log likelihood=53.81
## AIC=-71.62   AICc=-61.1   BIC=-27.87

Problem Set

Mitchell Roether

2025-09-16