Hyndman Chapter 3

1. Consider the GDP information in global_economy. Plot the GDP per capita for each country over time. Which country has the highest GDP per capita? How has this changed over time?

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggfortify)
## Loading required package: ggplot2
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(tsibbledata)
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✓ tibble 3.1.6     ✓ feasts 0.2.2
## ✓ tidyr  1.2.0     ✓ fable  0.3.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
global_economy %>% autoplot(GDP/Population, show.legend =  FALSE)
## Warning: Removed 3242 row(s) containing missing values (geom_path).

ge1<- global_economy

max(ge1$Year)
## [1] 2017
ge2 <- ge1[ge1$Year == 2017,]

ge2$GDPPC <- ge2$GDP/ge2$Population
ge2[order(-ge2$GDPPC),]
## # A tsibble: 262 x 10 [1Y]
## # Key:       Country [262]
##    Country    Code   Year     GDP Growth   CPI Imports Exports Population  GDPPC
##    <fct>      <fct> <dbl>   <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>  <dbl>
##  1 Luxembourg LUX    2017 6.24e10   2.30 111.    194.    230.      599449 1.04e5
##  2 Macao SAR… MAC    2017 5.04e10   9.10 136.     32.0    79.4     622567 8.09e4
##  3 Switzerla… CHE    2017 6.79e11   1.09  98.3    53.9    65.0    8466017 8.02e4
##  4 Norway     NOR    2017 3.99e11   1.92 115.     33.1    35.5    5282223 7.55e4
##  5 Iceland    ISL    2017 2.39e10   3.64 122.     42.8    47.0     341284 7.01e4
##  6 Ireland    IRL    2017 3.34e11   7.80 105.     87.9   120.     4813608 6.93e4
##  7 Qatar      QAT    2017 1.67e11   1.58 116.     37.3    51.0    2639211 6.32e4
##  8 United St… USA    2017 1.94e13   2.27 112.     NA      NA    325719178 5.95e4
##  9 North Ame… NAC    2017 2.10e13   2.35  NA      NA      NA    362492702 5.81e4
## 10 Singapore  SGP    2017 3.24e11   3.62 113.    149.    173.     5612253 5.77e4
## # … with 252 more rows
ge2
## # A tsibble: 262 x 10 [1Y]
## # Key:       Country [262]
##    Country    Code   Year     GDP Growth   CPI Imports Exports Population  GDPPC
##    <fct>      <fct> <dbl>   <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>  <dbl>
##  1 Afghanist… AFG    2017 1.95e10  2.67   146.    45.3    5.90   35530081   550.
##  2 Albania    ALB    2017 1.30e10  3.84   115.    46.6   31.5     2873457  4538.
##  3 Algeria    DZA    2017 1.68e11  1.60   142.    33.5   22.6    41318142  4055.
##  4 American … ASM    2017 6.34e 8 -5.38    NA     92.7   57.4       55641 11394.
##  5 Andorra    AND    2017 3.01e 9  1.87    NA     NA     NA         76965 39147.
##  6 Angola     AGO    2017 1.22e11 -0.147  281.    23.3   29.0    29784193  4100.
##  7 Antigua a… ATG    2017 1.51e 9  3.03   112.    NA     NA        102012 14803.
##  8 Arab World ARB    2017 2.59e12  0.977   NA     40.6   43.6   414491886  6240.
##  9 Argentina  ARG    2017 6.37e11  2.85    NA     13.8   11.2    44271041 14398.
## 10 Armenia    ARM    2017 1.15e10  7.50   124.    49.5   37.3     2930450  3937.
## # … with 252 more rows

Sorting the 2017 values shows that Luxembourg is the country with the highest GDP per capita in the most recent year of the data.
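
To see how this has changed over time, here is a quick sketch (assuming a recent dplyr with slice_max()) that computes GDP per capita, ranks countries within each year, and keeps the top country per year:

global_economy %>%
  as_tibble() %>%
  mutate(GDPPC = GDP / Population) %>%
  filter(!is.na(GDPPC)) %>%
  group_by(Year) %>%
  slice_max(GDPPC, n = 1) %>%
  ungroup() %>%
  select(Year, Country, GDPPC)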

2. For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect.

  • United States GDP from global_economy.
  • Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock.
  • Victorian Electricity Demand from vic_elec.
  • Gas production from aus_production.
USA <- ge1[ge1$Code == 'USA',]
USA  %>%  autoplot(GDP)

Victoria <- aus_livestock[aus_livestock$State == 'Victoria',]

VictoriaB <- Victoria[Victoria$Animal == 'Bulls, bullocks and steers',]

VictoriaB  %>%  autoplot(Count)

vic_elec  %>%  autoplot(Demand)

aus_production  %>% autoplot(Gas)
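
Of these four series, Gas production is the one where a transformation seems clearly appropriate, because the seasonal variation grows with the level of the series. A sketch using the same Guerrero-selected Box-Cox lambda approach used in the exercises below:

lambda_gas <- aus_production %>%
  features(Gas, features = guerrero) %>%
  pull(lambda_guerrero)

aus_production %>%
  autoplot(box_cox(Gas, lambda_gas))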

3. Why is a Box-Cox transformation unhelpful for the canadian_gas data?

canadian_gas  %>% autoplot(Volume)

l <- canadian_gas %>%
  features(Volume, features = guerrero)  %>%
  pull(lambda_guerrero)

canadian_gas %>%
  autoplot(box_cox(Volume,l)) 

The seasonal variation in canadian_gas increases and then decreases over the series rather than growing steadily with its level, so no single Box-Cox lambda can stabilise it; the variance is still not constant after the transformation.

4. What Box-Cox transformation would you select for your retail data (from Exercise 8 in Section 2.10)?

set.seed(15041207)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
  

myseries %>%
  autoplot(Turnover)

lambda_retail <- myseries %>%
  features(Turnover, features = guerrero) %>%
  pull(lambda_guerrero)

myseries %>%
  autoplot(box_cox(Turnover, lambda_retail))

The Box-Cox transformation with the Guerrero-selected lambda reduces the increasing seasonal variation, so the variability is roughly constant across the series.
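
To state which transformation was selected, we can print the estimated lambda (its value depends on the series drawn with the seed above); a value near 0 would correspond to a simple log transformation:

lambda_retail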

5. For the following series, find an appropriate Box-Cox transformation in order to stabilise the variance.

  • Tobacco from aus_production
lambda_tobacco <- aus_production %>%
  features(Tobacco, features = guerrero) %>%
  pull(lambda_guerrero)

aus_production %>%
  autoplot(box_cox(Tobacco, lambda_tobacco))
## Warning: Removed 24 row(s) containing missing values (geom_path).

  • Economy class passengers between Melbourne and Sydney from ansett

l2 <- ansett %>%
  filter(Class == "Economy",
         Airports == "MEL-SYD") %>%
  features(Passengers, features = guerrero) %>%
  pull(lambda_guerrero)

ansett %>%
  filter(Class == "Economy",
         Airports == "MEL-SYD") %>%
  mutate(Passengers = Passengers/1000) %>%
  autoplot(box_cox(Passengers, lambda = l2))

  • Pedestrian counts at Southern Cross Station from pedestrian
l3 <- pedestrian %>%
   filter(Sensor == "Southern Cross Station") %>%
   features(Count, features = guerrero) %>%
   pull(lambda_guerrero)

pedestrian %>%
  filter(Sensor == "Southern Cross Station") %>%
  autoplot(box_cox(Count,l3))

7. Consider the last five years of the Gas data from aus_production.

gas <- tail(aus_production, 5*4)
  • Plot the time series. Can you identify seasonal fluctuations and/or a trend-cycle?
gas  %>%
  autoplot(Gas)

We can see a positive trend and a seasonal component.

  • Use classical_decomposition with type=multiplicative to calculate the trend-cycle and seasonal indices.
gas %>%
  model(classical_decomposition(Gas,type = "multiplicative")) %>%
  components() %>%
  autoplot()
## Warning: Removed 2 row(s) containing missing values (geom_path).

  • Do the results support the graphical interpretation from part a?

Yes, the classical decomposition allowed us to see both the trend and the seasonal component.

  • Compute and plot the seasonally adjusted data.
gp1 <- gas %>%
             model(classical_decomposition(Gas,type = "multiplicative")) %>%
             components()
gp1 %>%
  ggplot(aes(x = Quarter)) +
  geom_line(aes(y = Gas, colour = "Gas")) +
  geom_line(aes(y = season_adjust, colour = "Seasonally adjusted")) +
  geom_line(aes(y = trend, colour = "Trend"))
## Warning: Removed 4 row(s) containing missing values (geom_path).

  • Change one observation to be an outlier (e.g., add 300 to one observation), and recompute the seasonally adjusted data. What is the effect of the outlier?
gas2 <- gas
gas2$Gas[9] <- gas2$Gas[9]+500

gas2 %>%
  model(classical_decomposition(Gas,type = "multiplicative")) %>%
  components() %>%
  autoplot() 
## Warning: Removed 2 row(s) containing missing values (geom_path).

We can clearly see where the outlier is. A single observation was enough to change even the trend component.

  • Does it make any difference if the outlier is near the end rather than in the middle of the time series?
gas3 <- gas
gas3$Gas[19] <- gas3$Gas[19]+500

gas3 %>%
  model(classical_decomposition(Gas,type = "multiplicative")) %>%
  components() %>%
  autoplot() 
## Warning: Removed 2 row(s) containing missing values (geom_path).

Yes, there is a difference. The seasonal component changes slightly, but the trend is closer to the original one (without the outlier), because classical decomposition does not estimate the trend at the very ends of the series, so an outlier near the end affects fewer trend values.

8. Recall your retail time series data (from Exercise 8 in Section 2.10). Decompose the series using X-11. Does it reveal any outliers, or unusual features that you had not noticed previously?

myseries %>%
  model(classical_decomposition(Turnover,type = "multiplicative")) %>%
  components() %>%
  autoplot()
## Warning: Removed 6 row(s) containing missing values (geom_path).

At the end of 2009 we can see an increase in the series, followed by a decrease at the beginning of 2010.

It is not entirely clear whether this should be treated as an outlier, but the decomposition makes it easy to notice.
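
The exercise asks specifically for an X-11 decomposition. A sketch using the feasts interface to X-13ARIMA-SEATS (this assumes the seasonal package, which feasts calls for X-11, is installed):

x11_dcmp <- myseries %>%
  model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
  components()

autoplot(x11_dcmp) +
  labs(title = "X-11 decomposition of the retail turnover series")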

9. Figures 3.19 and 3.20 show the result of decomposing the number of persons in the civilian labour force in Australia each month from February 1978 to August 1995.

  • Write about 3–5 sentences describing the results of the decomposition. Pay particular attention to the scales of the graphs in making your interpretation.

The decomposition shows a clear increasing trend over time, together with a regular seasonal component; note from the scales of the panels that the seasonal variation is small relative to the trend. This tells us how important it is to account for these components when forecasting the series. Figure 3.20 shows the seasonal component month by month, which makes both the differences between months and the gradual changes in the seasonal pattern over the years easier to see.

  • Is the recession of 1991/1992 visible in the estimated components?

Yes; looking at the remainder (random) component, we can see a small dip around 1991/1992.

This exercise uses the canadian_gas data (monthly Canadian gas production in billions of cubic metres, January 1960 – February 2005).

  • Plot the data using autoplot(), gg_subseries() and gg_season() to look at the effect of the changing seasonality over time.
canadian_gas %>%
  autoplot(Volume)+
  labs(title = "Monthly Canadian Gas Production")

canadian_gas %>%
  gg_subseries(Volume)+
  labs(title = "Monthly Canadian Gas Production")

canadian_gas %>%
  gg_season(Volume)+
  labs(title = "Monthly Canadian Gas Production")

Hyndman Chapter 7

Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with daily total demands and maximum temperatures.

jan14_vic_elec <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )
  • Plot the data and find the regression model for Demand with temperature as a predictor variable. Why is there a positive relationship?
jan14_vic_elec %>%
  as.data.frame() %>%
  ggplot(aes(x=Temperature, y=Demand)) +
    ylab("Electricity Demand") +
    xlab("Temperature") +
    geom_point() +
    geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

model<-lm(Demand ~ Temperature, data=jan14_vic_elec)
summary(model)
## 
## Call:
## lm(formula = Demand ~ Temperature, data = jan14_vic_elec)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49978 -10219   -121  18533  35441 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  59083.9    17424.8   3.391  0.00203 ** 
## Temperature   6154.3      601.3  10.235 3.89e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared:  0.7832, Adjusted R-squared:  0.7757 
## F-statistic: 104.7 on 1 and 29 DF,  p-value: 3.89e-11

The graph shows a positive relationship between temperature and electricity demand, and the regression coefficient on Temperature is highly significant. The relationship is positive because January is summer in Victoria: higher maximum temperatures increase air-conditioning use, which drives electricity demand up.

  • Produce a residual plot. Is the model adequate? Are there any outliers or influential observations?
model.res <- resid(model)
Fitted <- fitted(model)
plot(Fitted, model.res, xlab = "Fitted values", ylab = "Residuals")

The relationship between the residuals and the fitted values is not very clear.

plot(jan14_vic_elec$Demand, model.res, xlab = "Demand", ylab = "Residuals")

Here it looks like the residuals are correlated with Demand.

  • Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15°C, and compare it with the forecast if the maximum temperature was 35°C. Do you believe these forecasts? The following R code will get you started:
jan14_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan14_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan14_vic_elec)

model2 <- forecast(model, newdata=data.frame(Temperature=c(15,35)))
model2
##           Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 151398.35       151398.4 117127.2 185669.5  97951.22 204845.5
## 274484.25       274484.2 241333.0 307635.5 222783.69 326184.8
  • Give prediction intervals for your forecasts.
model2$lower[,1] 
##        1        2 
## 117127.2 241333.0
model2$upper[,1]
##        1        2 
## 185669.5 307635.5
  • Plot Demand vs Temperature for all of the available data in vic_elec aggregated to daily total demand and maximum temperature. What does this say about your model?
vic_elec %>%
  index_by(Date = as_date(Time)) %>%
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  ) %>%
  ggplot(aes(x = Temperature, y = Demand)) +
    ylab("Electricity Demand") +
    xlab("Temperature") +
    geom_point()

There is no simple linear relationship between these two variables over the full range of temperatures: demand rises at both low and high temperatures (heating as well as cooling), so the linear model fitted to January data alone does not generalise, and other factors should be taken into account.

Data set olympic_running contains the winning times (in seconds) in each Olympic Games sprint, middle-distance and long-distance track events from 1896 to 2016.

  • Plot the winning time against the year for each event. Describe the main features of the plot.

olympic_running
## # A tsibble: 312 x 4 [4Y]
## # Key:       Length, Sex [14]
##     Year Length Sex    Time
##    <int>  <int> <chr> <dbl>
##  1  1896    100 men    12  
##  2  1900    100 men    11  
##  3  1904    100 men    11  
##  4  1908    100 men    10.8
##  5  1912    100 men    10.8
##  6  1916    100 men    NA  
##  7  1920    100 men    10.8
##  8  1924    100 men    10.6
##  9  1928    100 men    10.8
## 10  1932    100 men    10.3
## # … with 302 more rows
olympic_running %>% autoplot(Time, show.legend =  TRUE)
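
The single-panel plot above overlays every event; a faceted sketch (plain ggplot2 on the same tsibble) makes the main features easier to describe: winning times fall over time in every event, women's events start later than men's, and there are gaps for the Games cancelled during the World Wars.

olympic_running %>%
  ggplot(aes(x = Year, y = Time, colour = Sex)) +
  geom_line() +
  geom_point(size = 1) +
  facet_wrap(~ Length, scales = "free_y") +
  labs(y = "Winning time (seconds)")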

  • Fit a regression line to the data for each event. Obviously the winning times have been decreasing, but at what average rate per year?
OR<-ts(olympic_running , frequency=1)
model3 <- tslm(Time ~ trend + Year, data = OR)
summary(model3)
## 
## Call:
## tslm(formula = Time ~ trend + Year, data = OR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -479.91 -291.19   -9.71  212.57  893.18 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2236.9636  1261.4729   1.773   0.0773 .  
## trend          4.8262     0.2384  20.246   <2e-16 ***
## Year          -1.3415     0.6440  -2.083   0.0382 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 354.9 on 278 degrees of freedom
##   (31 observations deleted due to missingness)
## Multiple R-squared:  0.5986, Adjusted R-squared:  0.5957 
## F-statistic: 207.3 on 2 and 278 DF,  p-value: < 2.2e-16

The coefficient on Year is negative, which is consistent with winning times decreasing over time. However, this pooled regression mixes all fourteen event/sex series, so it does not answer the question of the average rate of decrease per year for each event; a per-event fit is sketched below.
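
A per-event sketch using the fable interface (a minimal sketch: one TSLM per Length/Sex key with Year as the regressor, so its coefficient is the average change in winning time, in seconds, per year):

olympic_fits <- olympic_running %>%
  model(TSLM(Time ~ Year))

# Average change in winning time (seconds per year) for each event
olympic_fits %>%
  tidy() %>%
  filter(term == "Year")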

  • Plot the residuals against the year. What does this indicate about the suitability of the fitted lines?
cbind(Time = olympic_running$Year, Residuals = model3$residuals) %>%
  as.data.frame() %>%
  ggplot(aes(x = Time, y = Residuals)) +
  geom_point() +
  ylab("Residuals")
## Warning: Removed 31 rows containing missing values (geom_point).

There is no clear pattern in the residuals against year, but the residuals themselves are very large (the residual standard error is about 355 seconds) because the pooled model mixes events with very different winning times; the fitted lines are more reliable when estimated separately for each event.

  • Predict the winning time for each race in the 2020 Olympics. Give a prediction interval for your forecasts. What assumptions have you made in these calculations?
forecast1 <- forecast(model3,newdata = data.frame(Year = 2020))

forecast1$upper
## Time Series:
## Start = 313 
## End = 313 
## Frequency = 1 
##     Series 1 Series 2
## 313 1498.001 1743.049
forecast1$lower
## Time Series:
## Start = 313 
## End = 313 
## Frequency = 1 
##     Series 1 Series 2
## 313 577.4691 332.4212
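
Per-event 2020 forecasts and prediction intervals can also be obtained from the fits sketched above; the main assumptions are that each event's linear trend continues to 2020 and that the residuals are uncorrelated with constant variance:

olympic_fits %>%
  forecast(new_data = new_data(olympic_running, 1)) %>%
  hilo(level = 95)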

The data set souvenirs concerns the monthly sales figures of a shop which opened in January 1987 and sells gifts, souvenirs, and novelties. The shop is situated on the wharf at a beach resort town in Queensland, Australia. The sales volume varies with the seasonal population of tourists. There is a large influx of visitors to the town at Christmas and for the local surfing festival, held every March since 1988. Over time, the shop has expanded its premises, range of products, and staff.

  • Produce a time plot of the data and describe the patterns in the graph. Identify any unusual or unexpected fluctuations in the time series.
autoplot(souvenirs)
## Plot variable not specified, automatically selected `.vars = Sales`

As we can see, there is an increasing trend and a strong seasonal component, and the seasonal swings grow as the level of the series rises; the largest peaks occur each December, and from 1988 onwards March is also elevated because of the surfing festival.

  • Explain why it is necessary to take logarithms of these data before fitting a model.

The seasonal fluctuations grow with the level of the series, so the data are strongly skewed; taking logarithms makes the seasonal variation roughly constant over time, which is what an additive linear regression model requires.

  • Fit a regression model to the logarithms of these sales data with a linear trend, seasonal dummies and a “surfing festival” dummy variable.
# Monthly sales as a ts object; note that end = c(1993, 6) truncates the series at June 1993
souvts <- ts(souvenirs$Sales, start = c(1987, 1), end = c(1993, 6), frequency = 12)
lnsouvts <- log(souvts)

# Surfing-festival dummy: 1 in March of each year except March 1987 (the festival began in 1988)
dummyvar <- rep(0, length(souvts))
dummyvar[seq_along(dummyvar) %% 12 == 3] <- 1
dummyvar[3] <- 0
dummyvar <- ts(dummyvar, freq = 12, start = c(1987, 1))
new.data <- data.frame(lnsouvts, dummyvar)

# Log sales regressed on a linear trend, monthly seasonal dummies and the festival dummy
fit <- tslm(lnsouvts ~ trend + season + dummyvar, data = new.data)

# Forecast the next 12 months; because the series was cut at June 1993 the forecasts start in
# July 1993, so row 3 of fore.data corresponds to September 1993 rather than March
fore.data <- data.frame(dummyvar = rep(0, 12))
fore.data[3, ] <- 1
forecast(fit, newdata = fore.data)
##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## Jul 1993       9.883136  9.637329 10.128943  9.503919 10.262353
## Aug 1993       9.867772  9.621965 10.113579  9.488555 10.246989
## Sep 1993      10.531942 10.191069 10.872815 10.006062 11.057822
## Oct 1993      10.092605  9.846798 10.338412  9.713388 10.471822
## Nov 1993      10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993      11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994       9.430933  9.186093  9.675774  9.053207  9.808659
## Feb 1994       9.704370  9.459530  9.949210  9.326644 10.082096
## Mar 1994       9.695742  9.365756 10.025727  9.186658 10.204825
## Apr 1994       9.881046  9.636206 10.125887  9.503320 10.258772
## May 1994       9.928500  9.683659 10.173340  9.550774 10.306226
## Jun 1994       9.989861  9.745020 10.234701  9.612135 10.367587
  • Plot the residuals against time and against the fitted values. Do these plots reveal any problems with the model?
autoplot(fit$residuals)

The residual series looks roughly like white noise, so the log transformation was helpful.
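
The exercise also asks for the residuals plotted against the fitted values; a quick base-graphics sketch in the same style as the earlier residual plots:

plot(fitted(fit), residuals(fit),
     xlab = "Fitted values", ylab = "Residuals")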

  • Do boxplots of the residuals for each month. Does this reveal any problems with the model?
boxplot(residuals(fit)~cycle(residuals(fit)))

  • What do the values of the coefficients tell you about each variable?
fit$coefficients
## (Intercept)       trend     season2     season3     season4     season5 
##   7.6662400   0.0207611   0.2526756   0.2232860   0.3878297   0.4145219 
##     season6     season7     season8     season9    season10    season11 
##   0.4551219   0.5767690   0.5406444   0.6296713   0.7239552   1.1957774 
##    season12    dummyvar 
##   1.9473841   0.5543818

The trend coefficient implies that log sales increase by about 0.02 per month (roughly 2% per month). The seasonal coefficients measure how much each month differs from January, with by far the largest effects in November and December, and the surfing-festival dummy adds about 0.55 (on the log scale) to March sales from 1988 onwards.

  • What does the Ljung-Box test tell you about your model?
checkresiduals(fit)

## 
##  Breusch-Godfrey test for serial correlation of order up to 17
## 
## data:  Residuals from Linear regression model
## LM test = 31.535, df = 17, p-value = 0.01718

checkresiduals() applies a Breusch-Godfrey test (rather than a Ljung-Box test) for regression models; with a p-value of about 0.017 we reject the null hypothesis of no serial correlation, so the residuals are autocorrelated and the model can still be improved.

  • Regardless of your answers to the above questions, use your regression model to predict the monthly sales for 1994, 1995, and 1996. Produce prediction intervals for each of your forecasts.
# 36 months ahead from July 1993 (i.e. up to June 1996); the festival dummy is left at 0 for
# every month, so these forecasts ignore the March surfing-festival effect
fore.data <- data.frame(dummyvar = rep(0, 36))
preds <- forecast(fit, newdata = fore.data)
preds
##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## Jul 1993       9.883136  9.637329 10.128943  9.503919 10.262353
## Aug 1993       9.867772  9.621965 10.113579  9.488555 10.246989
## Sep 1993       9.977560  9.731753 10.223367  9.598343 10.356777
## Oct 1993      10.092605  9.846798 10.338412  9.713388 10.471822
## Nov 1993      10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993      11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994       9.430933  9.186093  9.675774  9.053207  9.808659
## Feb 1994       9.704370  9.459530  9.949210  9.326644 10.082096
## Mar 1994       9.695742  9.365756 10.025727  9.186658 10.204825
## Apr 1994       9.881046  9.636206 10.125887  9.503320 10.258772
## May 1994       9.928500  9.683659 10.173340  9.550774 10.306226
## Jun 1994       9.989861  9.745020 10.234701  9.612135 10.367587
## Jul 1994      10.132269  9.883394 10.381144  9.748319 10.516219
## Aug 1994      10.116905  9.868031 10.365780  9.732955 10.500856
## Sep 1994      10.226693  9.977819 10.475568  9.842743 10.610644
## Oct 1994      10.341738 10.092864 10.590613  9.957788 10.725689
## Nov 1994      10.834322 10.585447 11.083197 10.450372 11.218272
## Dec 1994      11.606690 11.357815 11.855564 11.222739 11.990640
## Jan 1995       9.680067  9.431764  9.928369  9.296999 10.063134
## Feb 1995       9.953503  9.705201 10.201806  9.570436 10.336570
## Mar 1995       9.944875  9.610605 10.279144  9.429182 10.460567
## Apr 1995      10.130180  9.881877 10.378482  9.747112 10.513247
## May 1995      10.177633  9.929330 10.425935  9.794566 10.560700
## Jun 1995      10.238994  9.990691 10.487296  9.855927 10.622061
## Jul 1995      10.381402 10.128745 10.634059  9.991617 10.771188
## Aug 1995      10.366039 10.113381 10.618696  9.976253 10.755824
## Sep 1995      10.475827 10.223169 10.728484 10.086041 10.865612
## Oct 1995      10.590872 10.338214 10.843529 10.201086 10.980657
## Nov 1995      11.083455 10.830798 11.336112 10.693669 11.473240
## Dec 1995      11.855823 11.603165 12.108480 11.466037 12.245608
## Jan 1996       9.929200  9.676730 10.181669  9.539704 10.318696
## Feb 1996      10.202636  9.950167 10.455106  9.813141 10.592132
## Mar 1996      10.194008  9.854949 10.533067  9.670926 10.717090
## Apr 1996      10.379313 10.126843 10.631782  9.989817 10.768809
## May 1996      10.426766 10.174296 10.679236 10.037270 10.816262
## Jun 1996      10.488127 10.235658 10.740597 10.098631 10.877623
  • How could you improve these predictions by modifying the model?

We could allow for the remaining autocorrelation, for example by keeping the same regression but modelling the errors as an ARMA process (a dynamic regression model).

The us_gasoline series consists of weekly data for supplies of US finished motor gasoline product, from 2 February 1991 to 20 January 2017. The units are in “million barrels per day”. Consider only the data to the end of 2004.

  • Fit a harmonic regression with trend to the data. Experiment with changing the number of Fourier terms. Plot the observed gasoline and fitted values and comment on what you see.
autoplot(us_gasoline)
## Plot variable not specified, automatically selected `.vars = Barrels`

# Note: start = (1991-1) and end = (2017-1) evaluate to 1990 and 2016, although the data
# actually run from February 1991 to January 2017, so window(end = 2005) below only
# approximately restricts the series to the end of 2004
gas <- ts(us_gasoline$Barrels, start=(1991-1), end=(2017-1),frequency = 52)

gas <- window(gas, end = 2005)
for(num in c(1, 2, 3, 5, 10)){
  var_name <- paste("fit",
                    as.character(num),
                    "_gasoline_2004",
                    sep = "")
  
  assign(var_name,
         tslm(gas ~ trend + fourier(
           gas,
           K = num
         ))
  )
  
  print(
    autoplot(gas) +
      autolayer(get(var_name)$fitted.values,
                series = as.character(num)) +
      ggtitle(var_name) +
      ylab("gasoline") 
  )
}

  • Select the appropriate number of Fourier terms to include by minimising the AICc or CV value.
for(i in c(1, 2, 3, 5, 10)){
  fit_gasoline_2004.name <- paste(
    "fit", as.character(i), "_gasoline_2004",
    sep = ""
  )
  writeLines(
    paste(
      "\n", fit_gasoline_2004.name, "\n"
    )
  )
  print(CV(get(fit_gasoline_2004.name)))
}
## 
##  fit1_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  8.012058e-02 -1.969129e+03 -1.969052e+03 -1.945827e+03  8.427750e-01 
## 
##  fit2_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  7.919464e-02 -1.978217e+03 -1.978072e+03 -1.945593e+03  8.449887e-01 
## 
##  fit3_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  7.470266e-02 -2.023838e+03 -2.023604e+03 -1.981892e+03  8.541546e-01 
## 
##  fit5_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  7.365185e-02 -2.034993e+03 -2.034518e+03 -1.974405e+03  8.569479e-01 
## 
##  fit10_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  7.173572e-02 -2.056054e+03 -2.054595e+03 -1.948861e+03  8.624864e-01
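
Among the models fitted in the loop, K = 10 gives the smallest CV and AICc of the values tried. A more idiomatic fpp3-style sketch of the same comparison (assuming the fable TSLM interface with the fourier() special), restricting the data to the end of 2004:

gasoline_2004 <- us_gasoline %>%
  filter(Week < yearweek("2005 W01"))

gasoline_fits <- gasoline_2004 %>%
  model(
    K1  = TSLM(Barrels ~ trend() + fourier(K = 1)),
    K5  = TSLM(Barrels ~ trend() + fourier(K = 5)),
    K10 = TSLM(Barrels ~ trend() + fourier(K = 10))
  )

glance(gasoline_fits) %>%
  select(.model, AICc, CV)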

The annual population of Afghanistan is available in the global_economy data set.

  • Plot the data and comment on its features. Can you observe the effect of the Soviet-Afghan war?
global_economy %>%
  filter(Country == "Afghanistan") %>%
  autoplot(Population)

The population grows steadily except for a clear dip during the Soviet-Afghan war years (roughly 1980-1989), after which rapid growth resumes.

  • Fit a linear trend model and compare this to a piecewise linear trend model with knots at 1980 and 1989.
global_economy %>%
  filter(Country=="Afghanistan")%>%
  model(TSLM(Population ~ Year)) %>%
  report()
## Series: Population 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5794518 -2582559   744761  2259222  6036280 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -829292529   49730866  -16.68   <2e-16 ***
## Year            425774      25008   17.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3188000 on 56 degrees of freedom
## Multiple R-squared: 0.8381,  Adjusted R-squared: 0.8352
## F-statistic: 289.9 on 1 and 56 DF, p-value: < 2.22e-16
global_economy %>%
  filter(Country=="Afghanistan")%>%
  filter(Year<1980)%>%
  model(TSLM(Population ~ Year)) %>%
  report()
## Series: Population 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -146380.8 -110290.6    -451.2  105877.8  202881.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -470734527    8787062  -53.57   <2e-16 ***
## Year            244657       4462   54.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115100 on 18 degrees of freedom
## Multiple R-squared: 0.994,   Adjusted R-squared: 0.9937
## F-statistic:  3007 on 1 and 18 DF, p-value: < 2.22e-16
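
The second model above is fitted only to the pre-1980 data rather than being a piecewise trend. A sketch of the piecewise specification the exercise asks for, using the trend(knots = ...) special in TSLM with knots at 1980 and 1989:

fit_afg <- global_economy %>%
  filter(Country == "Afghanistan") %>%
  model(
    linear    = TSLM(Population ~ trend()),
    piecewise = TSLM(Population ~ trend(knots = c(1980, 1989)))
  )

glance(fit_afg)
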
  • Generate forecasts from these two models for the five years after the end of the data, and comment on the results.
model.fc1<-global_economy %>%
  filter(Country=="Afghanistan")%>%
  model(TSLM(Population ~ Year)) 

model.fc2<-global_economy %>%
  filter(Country=="Afghanistan")%>%
  filter(Year>1989)%>%
  model(TSLM(Population ~ Year)) 

forecast(model.fc1, h=5) 
## # A fable: 5 x 5 [1Y]
## # Key:     Country, .model [1]
##   Country     .model                   Year          Population     .mean
##   <fct>       <chr>                   <dbl>              <dist>     <dbl>
## 1 Afghanistan TSLM(Population ~ Year)  2018   N(3e+07, 1.1e+13) 29919575.
## 2 Afghanistan TSLM(Population ~ Year)  2019   N(3e+07, 1.1e+13) 30345349.
## 3 Afghanistan TSLM(Population ~ Year)  2020 N(3.1e+07, 1.1e+13) 30771123.
## 4 Afghanistan TSLM(Population ~ Year)  2021 N(3.1e+07, 1.1e+13) 31196897.
## 5 Afghanistan TSLM(Population ~ Year)  2022 N(3.2e+07, 1.1e+13) 31622671.
forecast(model.fc2, h=5) 
## # A fable: 5 x 5 [1Y]
## # Key:     Country, .model [1]
##   Country     .model                   Year          Population     .mean
##   <fct>       <chr>                   <dbl>              <dist>     <dbl>
## 1 Afghanistan TSLM(Population ~ Year)  2018 N(3.6e+07, 1.4e+11) 35925602.
## 2 Afghanistan TSLM(Population ~ Year)  2019 N(3.7e+07, 1.4e+11) 36770747.
## 3 Afghanistan TSLM(Population ~ Year)  2020 N(3.8e+07, 1.4e+11) 37615892.
## 4 Afghanistan TSLM(Population ~ Year)  2021 N(3.8e+07, 1.5e+11) 38461037.
## 5 Afghanistan TSLM(Population ~ Year)  2022 N(3.9e+07, 1.5e+11) 39306182.

We get noticeably higher forecasts from the second specification, because it is fitted only to the post-1989 data and so follows the steeper recent growth; the full-sample linear trend is pulled down by the war years and its forecasts sit well below the last observed population (about 35.5 million in 2017).
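
For comparison, five-year forecasts from the piecewise model sketched above can be plotted against the data; its final segment should track the steeper post-1989 growth, much like the second model here:

fit_afg %>%
  forecast(h = "5 years") %>%
  autoplot(global_economy %>% filter(Country == "Afghanistan")) +
  labs(title = "Afghanistan population: linear vs piecewise trend forecasts")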