rm(list=ls()) #rm(list of objects) removes all objects from memory
library(tidyverse)
library(fpp3)
library(stargazer)

Question 1

Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with daily total demands and maximum temperatures.

jan14_vic_elec <- vic_elec |>
  filter(yearmonth(Time) == yearmonth("2014 Jan")) |>
  index_by(Date = as_date(Time)) |>
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )

Subpart a: plot and regression

Plot the data and find the regression model for Demand with temperature as a predictor variable. Why is there a positive relationship?

# Plot
jan14_vic_elec |>
  ggplot(aes(x = Demand, y = Temperature)) +
  labs(y = "Demand (Total electricity demand in MWh)",
       x = "Temperature (Temp. of Melbourne, BOM site 086071)") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

#### Regression of Demand on Temperature

fit_elec <- 
  jan14_vic_elec |>
  model(TSLM(Demand ~ Temperature)) |>
  report()
Series: Demand 
Model: TSLM 

Residuals:
     Min       1Q   Median       3Q      Max 
-49978.2 -10218.9   -121.3  18533.2  35440.6 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  59083.9    17424.8   3.391  0.00203 ** 
Temperature   6154.3      601.3  10.235 3.89e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24540 on 29 degrees of freedom
Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11

Subpart b

Produce a residual plot. Is the model adequate? Are there any outliers or influential observations?

# residuals
fit_elec |> gg_tsresiduals()

None of the residuals are outside of the Bartlett box, but graph of the residuals may trend slightly upwards, count of the residuals is skewed to the right which may confirm the trend. Perhaps, outliers in around Jan. 27th.

Subpart c

Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was
15∘C and compare it with the forecast if the with maximum temperature was 35∘C. Do you believe these forecasts? The following R code will get you started:

15∘C maximum

# 15∘C maximum
jan14_vic_elec |>
  model(TSLM(Demand ~ Temperature)) |>
  forecast(
    new_data(jan14_vic_elec, 1) |>
      mutate(Temperature = 15)
  ) |>
  autoplot(jan14_vic_elec)

35∘C maximum

# 35∘C maximum
jan14_vic_elec |>
  model(TSLM(Demand ~ Temperature)) |>
  forecast(
    new_data(jan14_vic_elec, 1) |>
      mutate(Temperature = 35)
  ) |>
  autoplot(jan14_vic_elec)

Subpart d

Give prediction intervals for your forecasts. #### 15∘C maximum

jan14_vic_elec |>
  model(TSLM(Demand ~ Temperature)) |>
  forecast(
    new_data(jan14_vic_elec, 1) |>
      mutate(Temperature = 15),
    level = c(80, 95)
  ) |>
  hilo(level = c(80, 95))
# A tsibble: 1 x 7 [1D]
# Key:       .model [1]
  .model                     Date      
  <chr>                      <date>    
1 TSLM(Demand ~ Temperature) 2014-02-01
# ℹ 5 more variables: Demand <dist>, .mean <dbl>, Temperature <dbl>,
#   `80%` <hilo>, `95%` <hilo>

35∘C maximum

jan14_vic_elec |>
  model(TSLM(Demand ~ Temperature)) |>
  forecast(
    new_data(jan14_vic_elec, 1) |>
      mutate(Temperature = 35),
    level = c(80, 95)
  ) |>
  hilo(level = c(80, 95))
# A tsibble: 1 x 7 [1D]
# Key:       .model [1]
  .model                     Date      
  <chr>                      <date>    
1 TSLM(Demand ~ Temperature) 2014-02-01
# ℹ 5 more variables: Demand <dist>, .mean <dbl>, Temperature <dbl>,
#   `80%` <hilo>, `95%` <hilo>

Subpart e

Plot Demand vs Temperature for all of the available data in vic_elec aggregated to daily total demand and maximum temperature. What does this say about your model?

all_vic_elec <- vic_elec |>
  index_by(Date = as_date(Time)) |>
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )

plot

all_vic_elec |>
  ggplot(aes(x = Demand, y = Temperature)) +
  labs(y = "Demand (Total electricity demand in MWh)",
       x = "Temperature (Temp. of Melbourne, BOM site 086071)") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Demand vs. temperature shows that relationsip is not linear when all data is considered.

plot

fit_all_elec <- 
  all_vic_elec |>
  model(TSLM(Demand ~ Temperature)) |>
  report()
Series: Demand 
Model: TSLM 

Residuals:
      Min        1Q    Median        3Q       Max 
-62818.41 -13699.52    -25.83  17452.70 118947.95 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 220345.8     2742.3  80.352   <2e-16 ***
Temperature    172.0      125.9   1.366    0.172    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 25470 on 1094 degrees of freedom
Multiple R-squared: 0.001702,   Adjusted R-squared: 0.0007897
F-statistic: 1.865 on 1 and 1094 DF, p-value: 0.17229

Intercept is significant, but the intercept is no longer significant.

Problem 4

The data set souvenirs concerns the monthly sales figures of a shop which opened in January 1987 and sells gifts, souvenirs, and novelties. The shop is situated on the wharf at a beach resort town in Queensland, Australia. The sales volume varies with the seasonal population of tourists. There is a large influx of visitors to the town at Christmas and for the local surfing festival, held every March since 1988. Over time, the shop has expanded its premises, range of products, and staff.

Subpart a

Produce a time plot of the data and describe the patterns in the graph. Identify any unusual or unexpected fluctuations in the time series.

souvenirs |> autoplot(Sales)

The data seems to exhibit an upward trend. The data also shows seasonality, sales seem to spike in quarter 4 of each year.

Subpart b

Explain why it is necessary to take logarithms of these data before fitting a model. The upward trend does appear to exponential, so taking the logarithm should result in a linear relationship with time.

Subpart c

Fit a regression model to the logarithms of these sales data with a linear trend, seasonal dummies and a “surfing festival” dummy variable.

fit_souvenirs <-
  souvenirs |>
  mutate(log_sales = log(Sales)) |>
  mutate(surfing_festival = if_else(year(Month) >= 1988 & month(Month) == 3, 1, 0)) |>
  model(TSLM(log_sales ~ trend() + season() + surfing_festival)) |>
  report()
Series: log_sales 
Model: TSLM 

Residuals:
      Min        1Q    Median        3Q       Max 
-0.336727 -0.127571  0.002568  0.109106  0.376714 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.6196670  0.0742471 102.626  < 2e-16 ***
trend()          0.0220198  0.0008268  26.634  < 2e-16 ***
season()year2    0.2514168  0.0956790   2.628 0.010555 *  
season()year3    0.2660828  0.1934044   1.376 0.173275    
season()year4    0.3840535  0.0957075   4.013 0.000148 ***
season()year5    0.4094870  0.0957325   4.277 5.88e-05 ***
season()year6    0.4488283  0.0957647   4.687 1.33e-05 ***
season()year7    0.6104545  0.0958039   6.372 1.71e-08 ***
season()year8    0.5879644  0.0958503   6.134 4.53e-08 ***
season()year9    0.6693299  0.0959037   6.979 1.36e-09 ***
season()year10   0.7473919  0.0959643   7.788 4.48e-11 ***
season()year11   1.2067479  0.0960319  12.566  < 2e-16 ***
season()year12   1.9622412  0.0961066  20.417  < 2e-16 ***
surfing_festival 0.5015151  0.1964273   2.553 0.012856 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.179 on 70 degrees of freedom
Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
F-statistic:   119 on 13 and 70 DF, p-value: < 2.22e-16

Subpart c

Plot the residuals against time and against the fitted values. Do these plots reveal any problems with the model?

fit_souvenirs |> gg_tsresiduals()

The Bartlett box shows that lags of 1, 2, 3 and 10 are outside the box. The histogram does look similar to normal, although can said slightly skewed. The mean does appear around zero.

Subpart e

Do boxplots of the residuals for each month. Does this reveal any problems with the model?

fit_souvenirs |>
  augment() |>
  mutate(month = month(Month, label = TRUE, abbr = TRUE)) |>
  ggplot(aes(x = month, y = .resid)) +
  geom_boxplot() +
  labs(
    x = "Month",
    y = "Residuals",
    title = "Residuals by month"
  )

Subpart f

What do the values of the coefficients tell you about each variable? The intecept, being the log of the sales in January is significant, and the slope of the trend is also significant.

Interestingly, March sales are not significantly different from January, but the Surfing Festival on March since 1988 is significantly different from January.

It is not surprising all the other months are significantly differnt from January based on a visual inspection of the monthly sales over time shows that January sales are the lowest for the year, excepting some March data.

Subpart g

What does the Ljung-Box test tell you about your model?

residuals(fit_souvenirs) |>
  features(.resid, ljung_box, lag = 24, dof = 14)
# A tibble: 1 × 3
  .model                                                  lb_stat lb_pvalue
  <chr>                                                     <dbl>     <dbl>
1 TSLM(log_sales ~ trend() + season() + surfing_festival)    112.         0

Because the p-vale is zero, it means to reject the null hypothesis which is the residuals are uncorrelated. Here, there seems to be some auto-correlation. The model needs some modification

Subpart h

Regardless of your answers to the above questions, use your regression model to predict the monthly sales for 1994, 1995, and 1996. Produce prediction intervals for each of your forecasts.

future_souvenirs <- tibble(
  Month = yearmonth("1994 Jan") + 0:35
) |>
  mutate(
    surfing_festival = if_else(month(Month) == 3, 1, 0)
  ) |>
  as_tsibble(index = Month)

fit_souvenirs |>
  forecast(
    new_data = future_souvenirs,
    level = c(80, 95)
  ) |>
  hilo(level = c(80, 95))
# A tsibble: 36 x 7 [1M]
# Key:       .model [1]
   .model                                                     Month
   <chr>                                                      <mth>
 1 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Jan
 2 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Feb
 3 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Mar
 4 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Apr
 5 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 May
 6 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Jun
 7 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Jul
 8 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Aug
 9 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Sep
10 TSLM(log_sales ~ trend() + season() + surfing_festival) 1994 Oct
# ℹ 26 more rows
# ℹ 5 more variables: log_sales <dist>, .mean <dbl>, surfing_festival <dbl>,
#   `80%` <hilo>, `95%` <hilo>

subpart i

How could you improve these predictions by modifying the model?

Part 5

The us_gasoline series consists of weekly data for supplies of US finished motor gasoline product, from 2 February 1991 to 20 January 2017. The units are in “million barrels per day”. Consider only the data to the end of 2004. ### Subpart a Fit a harmonic regression with trend to the data. Experiment with changing the number Fourier terms. Plot the observed gasoline and fitted values and comment on what you see.

Filter less than 2005 W01

us_gasoline_lt2005 <-
us_gasoline |>
  filter(Week < yearweek("2005 W01"))

Fit

fit_gasoline <- 
  us_gasoline_lt2005 |>
  model(
    K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
    K3 = TSLM(Barrels ~ trend() + fourier(K = 3)),
    K6 = TSLM(Barrels ~ trend() + fourier(K = 6))
  )

fit_gasoline |>
  augment() |>
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels), linewidth = 0.5) +
  geom_line(aes(y = .fitted), linewidth = 0.5, linetype = "dashed") +
  facet_wrap(vars(.model), scales = "free_y") +
  labs(
    title = "Observed and fitted gasoline series",
    y = "Barrels"
  )

Subpart b

Select the appropriate number of Fourier terms to include by minimising the AICc or CV value.

fit_gasoline_all <- 
  us_gasoline_lt2005 |>
    model(
        K1 = TSLM(Barrels ~ trend() + fourier(K=1)),
        K2 = TSLM(Barrels ~ trend() + fourier(K=2)),
        K3 = TSLM(Barrels ~ trend() + fourier(K=3)),
        K4 = TSLM(Barrels ~ trend() + fourier(K=4)),
        K5 = TSLM(Barrels ~ trend() + fourier(K=5)),
        K6 = TSLM(Barrels ~ trend() + fourier(K=6)),
      )

glance(fit_gasoline_all) |> select(.model, r_squared, adj_r_squared, CV, AICc)
# A tibble: 6 × 5
  .model r_squared adj_r_squared     CV   AICc
  <chr>      <dbl>         <dbl>  <dbl>  <dbl>
1 K1         0.825         0.824 0.0825 -1809.
2 K2         0.827         0.826 0.0819 -1814.
3 K3         0.837         0.835 0.0777 -1853.
4 K4         0.839         0.837 0.0773 -1856.
5 K5         0.841         0.838 0.0766 -1862.
6 K6         0.846         0.844 0.0744 -1884.

K=6 is the best model with the smallest CV and AICc.

Subpart c

Plot the residuals of the final model using the gg_tsresiduals() function and comment on these. Use a Ljung-Box test to check for residual autocorrelation. #### Fit harmonic when K=6

fit_gasoline_6 <- 
  us_gasoline_lt2005 |>
    model(
        K6 = TSLM(Barrels ~ trend() + fourier(K=6)),
      )

Plot residuals

fit_gasoline_6 |> gg_tsresiduals()

There does not seem to be trend when residuals are plotted over time. Further, the autocorrelation results are with into the Bartlett box except for one point is just above the box. Finally, the histogram of the residuals results is generally normal distribution centered around zero. The residuals look very close to white noise.

Ljung-Box test

residuals(fit_gasoline_6) |>
  features(.resid, ljung_box, lag = 24, dof = 14)
# A tibble: 1 × 3
  .model lb_stat lb_pvalue
  <chr>    <dbl>     <dbl>
1 K6        33.6  0.000218

The p-value is very small, indicating the null hypothesis should be rejected that the residuals are uncorrelated.

Subpart d

Generate forecasts for the next year of data and plot these along with the actual data for 2005. Comment on the forecasts.

future_gasoline_2005 <- tibble(
  Week = yearweek("2005 W01") + 0:51
) |>
  as_tsibble(index = Week)

fc_gasoline_2005 <- fit_gasoline_6 |>
  forecast(new_data = future_gasoline_2005)

fc_gasoline_2005 |>
  autoplot(
    us_gasoline |>
      filter(year(Week) == 2005),
    level = NULL
  ) +
  labs(
    title = "US gasoline forecasts for 2005",
    y = "Million barrels per day"
  )

Problem 6

The annual population of Afghanistan is available in the global_economy data set.